Skip to content

Commit

Permalink
Add growth protocol UUID to biosample (#246)
Browse files Browse the repository at this point in the history
Compute bia_data_model.BioSample with the UUID of the specimen growth protocol
  • Loading branch information
kbab authored Nov 14, 2024
1 parent bcfd595 commit d42de60
Show file tree
Hide file tree
Showing 14 changed files with 302 additions and 114 deletions.
5 changes: 1 addition & 4 deletions bia-ingest/bia_ingest/bia_object_creation_utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
from pathlib import Path
from uuid import UUID
from pydantic import BaseModel, ValidationError
import hashlib
import uuid
from typing import Any, Dict, List, Type

from bia_ingest.cli_logging import IngestionResult, log_failed_model_creation
from bia_ingest.config import settings


def filter_model_dictionary(dictionary: dict, target_model: Type[BaseModel]):
Expand All @@ -30,7 +27,7 @@ def dicts_to_api_models(
dicts: List[Dict[str, Any]],
api_model_class: Type[BaseModel],
valdiation_error_tracking: IngestionResult,
) -> BaseModel:
) -> List[BaseModel]:
"""
This function instantiates any API model given a dict of its attributes
Hence the use of the pydantic BaseModel which all API models are derived from in the type hinting
Expand Down
8 changes: 3 additions & 5 deletions bia-ingest/bia_ingest/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from bia_ingest.ingest.study import get_study
from bia_ingest.ingest.dataset import get_dataset
from bia_ingest.ingest.file_reference import get_file_reference_by_dataset
from bia_ingest.ingest.specimen import get_specimen
from bia_ingest.ingest.image_acquisition_protocol import get_image_acquisition_protocol
from bia_ingest.ingest.annotation_method import get_annotation_method
from bia_ingest.persistence_strategy import (
Expand Down Expand Up @@ -76,6 +75,9 @@ def ingest(

get_study(submission, result_summary, persister=persister)

# Specimen, BioSample and Protocol (specimen growth protocol) depend on Dataset
# Specimen (note - this is very different from Biostudies.Specimen) artefacts are processed as part of bia_data_models.Dataset
# BioSamples are processed as part of Specimen and specimen growth protocol (Protocol) are processed as part of BioSample
datasets = get_dataset(submission, result_summary, persister=persister)

process_files = determine_file_processing(
Expand All @@ -95,10 +97,6 @@ def ingest(

get_image_acquisition_protocol(submission, result_summary, persister=persister)

# Specimen
# Biosample and Specimen artefacts are processed as part of bia_data_models.Specimen (note - this is very different from Biostudies.Specimen)
get_specimen(submission, result_summary, persister=persister)

get_annotation_method(submission, result_summary, persister=persister)

# typer.echo(study.model_dump_json(indent=2))
Expand Down
122 changes: 116 additions & 6 deletions bia-ingest/bia_ingest/ingest/biosample.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import logging
from typing import List, Any, Dict, Optional

from bia_ingest.ingest.generic_conversion_utils import get_associations_for_section
from bia_ingest.ingest.specimen_growth_protocol import (
get_specimen_growth_protocol,
)

from ..bia_object_creation_utils import (
dict_to_uuid,
dicts_to_api_models,
Expand Down Expand Up @@ -30,10 +35,6 @@ def get_biosample(
persister: Optional[PersistenceStrategy] = None,
) -> List[bia_data_model.BioSample]:
biosample_model_dicts = extract_biosample_dicts(submission)
#
# growth_protocols = get_specimen_growth_protocol(
# submission, result_summary, persister
# )

biosamples = dicts_to_api_models(
biosample_model_dicts,
Expand All @@ -52,7 +53,113 @@ def get_biosample(
return biosamples


def extract_biosample_dicts(submission: Submission) -> List[Dict[str, Any]]:
# TODO: Rewrite this function. What we need is
# get_biosample_for_association https://app.clickup.com/t/8696nan92
def get_biosample_by_study_component(
    submission: Submission,
    result_summary: dict,
    persister: Optional[PersistenceStrategy] = None,
) -> Dict[str, List[bia_data_model.BioSample]]:
    """Return the biosamples of each study component, keyed by component name.

    For every "Study Component" section of the submission, each association
    that names a biosample yields one ``bia_data_model.BioSample``. When the
    association also names a specimen, and a growth protocol whose
    ``title_id`` equals that specimen title exists, the growth protocol UUID
    is attached to the biosample and the biosample UUID is recomputed.

    Args:
        submission: The submission to extract biosamples from.
        result_summary: Ingestion results indexed by accession number; the
            entry for ``submission.accno`` is passed on for validation-error
            tracking and model-count logging.
        persister: Optional persistence strategy. It is forwarded to
            ``get_specimen_growth_protocol`` and, if given, used to persist
            the unique biosamples created here.

    Returns:
        A dict mapping study component name to the list of biosamples
        associated with that study component (possibly empty).
    """

    biosample_model_dicts = extract_biosample_dicts(submission, filter_dict=False)

    # Get growth protocols as UUIDs needed in biosample.
    # If we are persisting, this call ensures the growth protocols
    # are created and persisted.
    growth_protocols = get_specimen_growth_protocol(
        submission, result_summary, persister
    )
    growth_protocol_title_to_uuid_map = {
        gp.title_id: gp.uuid for gp in growth_protocols
    }

    # Get associations to allow mapping to biosample
    study_components = find_sections_recursive(
        submission.section,
        [
            "Study Component",
        ],
        [],
    )

    biosample_by_study_component = {}
    for study_component in study_components:
        # A default avoids an opaque StopIteration when "Name" is missing.
        study_component_name = next(
            (
                attr.value
                for attr in study_component.attributes
                if attr.name == "Name"
            ),
            None,
        )
        if study_component_name is None:
            logger.warning(
                "Skipping study component without a 'Name' attribute "
                "while collecting biosamples"
            )
            continue

        component_biosamples = biosample_by_study_component.setdefault(
            study_component_name, []
        )
        for association in get_associations_for_section(study_component):
            biosample_title = association.get("biosample", None)
            specimen_title = association.get("specimen", None)
            growth_protocol_uuid = None
            if biosample_title and specimen_title:
                growth_protocol_uuid = growth_protocol_title_to_uuid_map.get(
                    specimen_title, None
                )
            elif biosample_title:
                logger.warning(
                    f"Could not find specimen association for biosample {biosample_title} in study component {study_component_name}"
                )
            else:
                # This is to be expected in some cases. E.g. Annotation datasets ...
                logger.warning(
                    f"Could not find biosample for study component {study_component_name}"
                )
                continue

            # Attach specimen growth protocol uuid and recompute biosample uuid.
            # Currently assuming there should be only one growth protocol
            # per biosample AND biosample titles are unique.
            # TODO: Log warning if above is not true.
            biosample_template = next(
                (
                    model_dict
                    for model_dict in biosample_model_dicts
                    if model_dict["title_id"] == biosample_title
                ),
                None,
            )
            if biosample_template is None:
                logger.warning(
                    f"Could not find biosample {biosample_title} referenced by study component {study_component_name}"
                )
                continue

            # Work on a copy: mutating the shared template would leak this
            # association's growth protocol (and recomputed uuid) into later
            # associations that reference the same biosample without one.
            biosample_model_dict = dict(biosample_template)
            if growth_protocol_uuid:
                biosample_model_dict["growth_protocol_uuid"] = growth_protocol_uuid
                biosample_model_dict["uuid"] = generate_biosample_uuid(
                    biosample_model_dict
                )
            biosample_model_dict = filter_model_dictionary(
                biosample_model_dict, bia_data_model.BioSample
            )
            biosample_models = dicts_to_api_models(
                [
                    biosample_model_dict,
                ],
                bia_data_model.BioSample,
                result_summary[submission.accno],
            )
            if not biosample_models:
                # Presumably model validation failed and was recorded by
                # dicts_to_api_models - TODO confirm. Nothing to attach.
                logger.warning(
                    f"Could not create biosample {biosample_title} for study component {study_component_name}"
                )
                continue
            component_biosamples.append(biosample_models[0])

    # Save unique biosample models (de-duplicated on uuid; last one wins)
    unique_biosamples = {
        biosample.uuid: biosample
        for biosample_list in biosample_by_study_component.values()
        for biosample in biosample_list
    }
    biosamples = list(unique_biosamples.values())
    if persister and biosamples:
        persister.persist(biosamples)
    log_model_creation_count(
        bia_data_model.BioSample, len(biosamples), result_summary[submission.accno]
    )
    return biosample_by_study_component


def extract_biosample_dicts(
submission: Submission,
filter_dict: bool = True,
) -> List[Dict[str, Any]]:
biosample_sections = find_sections_recursive(submission.section, ["Biosample"], [])

key_mapping = [
Expand Down Expand Up @@ -111,9 +218,11 @@ def extract_biosample_dicts(submission: Submission) -> List[Dict[str, Any]]:
model_dict[api_key].append(attr_dict[biostudies_key])

model_dict["accession_id"] = submission.accno
model_dict["growth_protocol_uuid"] = None
model_dict["uuid"] = generate_biosample_uuid(model_dict)
model_dict["version"] = 0
model_dict = filter_model_dictionary(model_dict, bia_data_model.BioSample)
if filter_dict:
model_dict = filter_model_dictionary(model_dict, bia_data_model.BioSample)
model_dicts.append(model_dict)

return model_dicts
Expand All @@ -129,5 +238,6 @@ def generate_biosample_uuid(biosample_dict: Dict[str, Any]) -> str:
"intrinsic_variable_description",
"extrinsic_variable_description",
"experimental_variable_description",
"growth_protocol_uuid",
]
return dict_to_uuid(biosample_dict, attributes_to_consider)
41 changes: 5 additions & 36 deletions bia-ingest/bia_ingest/ingest/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from ..cli_logging import log_failed_model_creation, log_model_creation_count
from .generic_conversion_utils import (
get_generic_section_as_list,
get_associations_for_section,
get_generic_section_as_dict,
)
import bia_ingest.ingest.study as study_conversion
Expand Down Expand Up @@ -58,40 +58,7 @@ def get_dataset(
datasets = []
for section in study_components:
attr_dict = attributes_to_dict(section.attributes)
key_mapping = [
(
"image_analysis",
"Image analysis",
None,
),
(
"image_correlation",
"Image correlation",
None,
),
(
"biosample",
"Biosample",
None,
),
(
"image_acquisition",
"Image acquisition",
None,
),
(
"specimen",
"Specimen",
None,
),
]
associations = get_generic_section_as_list(
section,
[
"Associations",
],
key_mapping,
)
associations = get_associations_for_section(section)

analysis_method_list = []

Expand Down Expand Up @@ -163,7 +130,9 @@ def get_dataset(
)
dataset.attribute.append(acquisition_process_uuid_as_attr)

subject = get_specimen_for_dataset(submission, dataset, result_summary)
subject = get_specimen_for_dataset(
submission, dataset, result_summary, persister
)
if subject:
subject_uuid_attr_dict = {
"provenance": semantic_models.AttributeProvenance("bia_ingest"),
Expand Down
43 changes: 42 additions & 1 deletion bia-ingest/bia_ingest/ingest/generic_conversion_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
def get_generic_section_as_list(
root: Submission | Section,
section_name: List[str],
key_mapping: List[Tuple[str, str, str | None | List]],
key_mapping: List[Tuple[str, str, Union[str, None, List]]],
mapped_object: Optional[BaseModel] = None,
mapped_attrs_dict: Optional[Dict[str, Any]] = None,
valdiation_error_tracking: Optional[IngestionResult] = None,
Expand Down Expand Up @@ -134,3 +134,44 @@ def object_value_pair_to_dict(
object_dict[key].append(obj)

return object_dict


def get_associations_for_section(
    section: Section,
) -> List[BaseModel | Dict[str, str | List[str]]]:
    """Return the "Associations" entries of a section.

    The section is assumed to be a Study Component. The lookup is delegated
    to ``get_generic_section_as_list`` with a key mapping from the Biostudies
    attribute names ("Image analysis", "Biosample", ...) to their snake_case
    keys; the third element of every mapping tuple is ``None``.
    """
    # (output key, Biostudies attribute name) pairs, in the original order.
    association_fields = (
        ("image_analysis", "Image analysis"),
        ("image_correlation", "Image correlation"),
        ("biosample", "Biosample"),
        ("image_acquisition", "Image acquisition"),
        ("specimen", "Specimen"),
    )
    key_mapping = [
        (output_key, biostudies_name, None)
        for output_key, biostudies_name in association_fields
    ]
    return get_generic_section_as_list(section, ["Associations"], key_mapping)
Loading

0 comments on commit d42de60

Please sign in to comment.