Skip to content

Commit

Permalink
use new uuid creation functions (#259)
Browse files Browse the repository at this point in the history
* use new uuid creation functions

* fix indexing for file paths in cli test

* changed order of assert statements in cli test to be more sensible
  • Loading branch information
sherwoodf authored Dec 4, 2024
1 parent f29a39c commit c29d558
Show file tree
Hide file tree
Showing 28 changed files with 492 additions and 821 deletions.
19 changes: 0 additions & 19 deletions bia-ingest/bia_ingest/bia_object_creation_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
from pydantic import BaseModel, ValidationError
import hashlib
import uuid
from typing import Any, Dict, List, Type, Optional

from bia_ingest.cli_logging import (
Expand All @@ -10,23 +8,6 @@
)


def filter_model_dictionary(dictionary: dict, target_model: Type[BaseModel]):
    """
    Keep only the entries of ``dictionary`` that are fields of ``target_model``.

    Keys are looked up against ``target_model.model_fields``; entries whose
    key is not a declared field are dropped. The result follows the order of
    the model's fields, not the order of ``dictionary``. The input dictionary
    is not modified.
    """
    filtered = {}
    for field_name in target_model.model_fields:
        if field_name in dictionary:
            filtered[field_name] = dictionary[field_name]
    return filtered


def dict_to_uuid(my_dict: Dict[str, Any], attributes_to_consider: List[str]) -> str:
    """
    Create a deterministic uuid string from specific keys in a dictionary.

    The values stored under ``attributes_to_consider`` are formatted as
    strings, concatenated in the given order and MD5-hashed. The digest is
    then turned into a UUID with the version-4 bits set.

    Args:
        my_dict: Dictionary holding the values that seed the uuid.
        attributes_to_consider: Keys of ``my_dict`` to include in the seed.
            Order is significant.

    Returns:
        The uuid in its canonical hyphenated string form.

    Raises:
        KeyError: If a key in ``attributes_to_consider`` is missing from
            ``my_dict``.
    """
    # TODO: Need to use a canonical version for this function e.g. from API

    # NOTE: values are joined without a delimiter, so ("ab", "c") and
    # ("a", "bc") produce the same seed. Kept as-is so that uuids generated
    # previously stay stable.
    seed = "".join(f"{my_dict[attr]}" for attr in attributes_to_consider)
    # MD5 only derives an identifier here, it is not a security measure.
    # Declaring that keeps the call usable on FIPS-restricted Python builds
    # and does not change the digest.
    hexdigest = hashlib.md5(seed.encode("utf-8"), usedforsecurity=False).hexdigest()
    return str(uuid.UUID(version=4, hex=hexdigest))


def dict_to_api_model(
dict: dict[str, Any],
api_model_class: Type[BaseModel],
Expand Down
13 changes: 8 additions & 5 deletions bia-ingest/bia_ingest/biostudies/process_submission_v4.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,27 +23,28 @@

def process_submission_v4(submission, result_summary, process_files, persister):
study = get_study(submission, result_summary, persister=persister)
study_uuid = study.uuid

association_object_dict = {}
association_object_dict["image_acquisition_protocol"] = (
get_image_acquisition_protocol_map(
submission, result_summary, persister=persister
submission, study_uuid, result_summary, persister=persister
)
)
association_object_dict["annotation_method"] = get_annotation_method_as_map(
submission, result_summary, persister=persister
submission, study_uuid, result_summary, persister=persister
)
association_object_dict["specimen_imaging_preparation_protocol"] = (
get_specimen_imaging_preparation_protocol_as_map(
submission, result_summary, persister=persister
submission, study_uuid, result_summary, persister=persister
)
)
growth_protocol_map = get_growth_protocol_as_map(
submission, result_summary, persister=persister
submission, study_uuid, result_summary, persister=persister
)
association_object_dict["growth_protocol"] = growth_protocol_map
association_object_dict["bio_sample"] = get_bio_sample_as_map(
submission, growth_protocol_map, result_summary, persister=persister
submission, study_uuid, growth_protocol_map, result_summary, persister=persister
)

association_object_dict["image_analysis_method"] = get_image_analysis_method_as_map(
Expand All @@ -56,6 +57,7 @@ def process_submission_v4(submission, result_summary, process_files, persister):

datasets = get_dataset(
submission,
study_uuid,
association_object_dict,
result_summary,
persister=persister,
Expand All @@ -64,6 +66,7 @@ def process_submission_v4(submission, result_summary, process_files, persister):
if process_files:
get_file_reference_by_dataset(
submission,
study_uuid,
datasets,
result_summary,
persister=persister,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def find_sections_recursive(

def find_datasets_with_file_lists(
submission: Submission,
) -> List[Dict[str, List[Dict[str, Union[str, None, List[str]]]]]]:
) -> Dict[str, List[Dict[str, Union[str, None, List[str]]]]]:
"""
Return dict with dataset names as keys and file lists dicts as values
Expand Down
45 changes: 15 additions & 30 deletions bia-ingest/bia_ingest/biostudies/v4/annotation_method.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,27 @@
import logging
from typing import List, Any, Dict, Optional
from typing import Any, Dict, Optional
from uuid import UUID

from ...bia_object_creation_utils import (
dict_to_uuid,
dict_map_to_api_models,
filter_model_dictionary,
)
from bia_ingest.bia_object_creation_utils import dict_map_to_api_models
from bia_ingest.persistence_strategy import PersistenceStrategy

from ...cli_logging import log_model_creation_count
from ..submission_parsing_utils import (
from bia_ingest.biostudies.submission_parsing_utils import (
find_sections_recursive,
attributes_to_dict,
case_insensitive_get,
)
from ..api import (
from bia_ingest.biostudies.api import (
Submission,
)
from bia_shared_datamodels import bia_data_model, semantic_models
from ...persistence_strategy import PersistenceStrategy
from bia_shared_datamodels.uuid_creation import create_annotation_method_uuid

logger = logging.getLogger("__main__." + __name__)


def get_annotation_method_as_map(
submission: Submission,
study_uuid: UUID,
result_summary: dict,
persister: Optional[PersistenceStrategy] = None,
) -> dict[str, bia_data_model.AnnotationMethod]:
Expand All @@ -38,7 +36,9 @@ def get_annotation_method_as_map(
There is no annotation method object in biostudies, so no need to link it via associations.
"""

annotation_method_model_dicts = extract_annotation_method_dicts(submission)
annotation_method_model_dicts = extract_annotation_method_dicts(
submission, study_uuid
)
annotation_methods = dict_map_to_api_models(
annotation_method_model_dicts,
bia_data_model.AnnotationMethod,
Expand All @@ -55,6 +55,7 @@ def get_annotation_method_as_map(

def extract_annotation_method_dicts(
submission: Submission,
study_uuid: UUID,
) -> dict[str, dict[str, Any]]:
annotation_sections = find_sections_recursive(submission.section, ["Annotations"])

Expand Down Expand Up @@ -97,27 +98,11 @@ def extract_annotation_method_dicts(
semantic_models.AnnotationMethodType("other"),
]

model_dict["accno"] = section.__dict__.get("accno", "")
model_dict["accession_id"] = submission.accno
model_dict["uuid"] = generate_annotation_method_uuid(model_dict)
model_dict["version"] = 0
model_dict = filter_model_dictionary(
model_dict, bia_data_model.AnnotationMethod
model_dict["uuid"] = create_annotation_method_uuid(
model_dict["title_id"], study_uuid
)
model_dict["version"] = 0

model_dict_map[attr_dict["Title"]] = model_dict

return model_dict_map


def generate_annotation_method_uuid(protocol_dict: Dict[str, Any]) -> str:
    """
    Build the uuid for an annotation method dictionary.

    Delegates to ``dict_to_uuid`` with the fixed, ordered list of keys below.
    ``protocol_dict`` is expected to contain every one of them.
    """
    return dict_to_uuid(
        protocol_dict,
        [
            "accession_id",
            "accno",
            "title_id",
            "protocol_description",
            "annotation_criteria",
            "annotation_coverage",
            "method_type",
        ],
    )
66 changes: 28 additions & 38 deletions bia-ingest/bia_ingest/biostudies/v4/bio_sample.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,37 @@
import logging
from typing import List, Any, Optional
from uuid import UUID
from copy import deepcopy

from bia_ingest.biostudies.generic_conversion_utils import (
get_associations_for_section,
Association,
)
from copy import deepcopy

from bia_ingest.bia_object_creation_utils import (
dict_to_uuid,
dicts_to_api_models,
dict_map_to_api_models,
filter_model_dictionary,
)

from bia_ingest.biostudies.submission_parsing_utils import (
find_sections_recursive,
attributes_to_dict,
case_insensitive_get,
)
from bia_ingest.biostudies.api import Submission, Section


from bia_ingest.bia_object_creation_utils import (
dicts_to_api_models,
dict_map_to_api_models,
)
from bia_ingest.cli_logging import IngestionResult
from bia_shared_datamodels import bia_data_model, semantic_models
from bia_ingest.persistence_strategy import PersistenceStrategy


from bia_shared_datamodels import bia_data_model, semantic_models
from bia_shared_datamodels.uuid_creation import create_bio_sample_uuid

logger = logging.getLogger("__main__." + __name__)


def get_bio_sample_as_map(
submission: Submission,
study_uuid: UUID,
growth_protocol_map: dict[str, bia_data_model.Protocol],
result_summary: dict,
persister: Optional[PersistenceStrategy] = None,
Expand All @@ -49,7 +52,7 @@ def get_bio_sample_as_map(
"""

biosample_model_dicts = extract_biosample_dicts(
submission, growth_protocol_map, result_summary
submission, study_uuid, growth_protocol_map, result_summary
)

biosamples = dict_map_to_api_models(
Expand All @@ -66,6 +69,7 @@ def get_bio_sample_as_map(

def extract_biosample_dicts(
submission: Submission,
study_uuid: UUID,
growth_protocol_map: dict[str, bia_data_model.BioSample],
result_summary: dict,
) -> list[dict[str, Any]]:
Expand All @@ -74,7 +78,6 @@ def extract_biosample_dicts(
key_mapping = [
("title_id", "Title", ""),
("biological_entity_description", "Biological entity", ""),
("organism", "Organism", ""),
]

model_dicts_map = {}
Expand Down Expand Up @@ -102,13 +105,11 @@ def extract_biosample_dicts(
if biostudies_key in attr_dict:
model_dict[api_key].append(attr_dict[biostudies_key])

model_dict["accno"] = section.accno
model_dict["organism_classification"] = [
t.model_dump()
for t in get_taxon(model_dict, section, result_summary[submission.accno])
for t in get_taxon(attr_dict, section, result_summary[submission.accno])
]

model_dict["accession_id"] = submission.accno
bs_without_gp, growth_protocol_uuids = check_for_growth_protocol_uuids(
attr_dict["Title"], submission, growth_protocol_map
)
Expand All @@ -117,37 +118,26 @@ def extract_biosample_dicts(
for gp_uuid in growth_protocol_uuids:
model_dict_with_gp = deepcopy(model_dict)
model_dict_with_gp["growth_protocol_uuid"] = gp_uuid[1]
model_dict_with_gp["uuid"] = generate_biosample_uuid(model_dict_with_gp)
model_dict_with_gp = filter_model_dictionary(
model_dict_with_gp, bia_data_model.BioSample
model_dict_with_gp["uuid"] = create_bio_sample_uuid(
model_dict_with_gp["title_id"],
study_uuid,
model_dict_with_gp["growth_protocol_uuid"],
)
model_dicts_map[attr_dict["Title"] + "." + gp_uuid[0]] = model_dict_with_gp

if bs_without_gp:
model_dict["growth_protocol_uuid"] = None
model_dict["uuid"] = generate_biosample_uuid(model_dict)
model_dict = filter_model_dictionary(model_dict, bia_data_model.BioSample)
model_dict["uuid"] = create_bio_sample_uuid(
model_dict["title_id"], study_uuid
)
model_dicts_map[attr_dict["Title"]] = model_dict
return model_dicts_map


def generate_biosample_uuid(biosample_dict: dict[str, Any]) -> str:
    """
    Build the uuid for a biosample dictionary.

    Delegates to ``dict_to_uuid`` with the fixed, ordered list of keys below.
    ``biosample_dict`` is expected to contain every one of them.
    """
    return dict_to_uuid(
        biosample_dict,
        [
            "accession_id",
            "accno",
            "title_id",
            "organism_classification",
            "biological_entity_description",
            "intrinsic_variable_description",
            "extrinsic_variable_description",
            "experimental_variable_description",
            "growth_protocol_uuid",
        ],
    )


def get_taxon(
model_dict: dict, biosample_section: Section, ingestion_result: IngestionResult
biosample_attr_dict: dict,
biosample_section: Section,
ingestion_result: IngestionResult,
) -> list[semantic_models.Taxon]:
taxon_dicts = []

Expand All @@ -169,7 +159,7 @@ def get_taxon(
taxon_dicts.append(model_dict)

else:
organism: str = model_dict.pop("organism", "")
organism: str = biosample_attr_dict.pop("Organism", "")
try:
organism_scientific_name, organism_common_name = organism.split("(")
organism_common_name = organism_common_name.rstrip(")")
Expand All @@ -186,7 +176,7 @@ def get_taxon(
)

taxon = dicts_to_api_models(taxon_dicts, semantic_models.Taxon, ingestion_result)

return taxon


Expand Down
Loading

0 comments on commit c29d558

Please sign in to comment.