Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Model updates, and some conversion logic #123

Merged
merged 8 commits into from
Jul 22, 2024
18 changes: 5 additions & 13 deletions bia-ingest-shared-models/bia_ingest_sm/conversion/biosample.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
from .utils import (
dicts_to_api_models,
find_sections_recursive,
dict_to_uuid
dict_to_uuid,
persist
)
from ..biostudies import (
Submission,
Expand All @@ -25,14 +26,7 @@ def get_biosample(
biosamples = dicts_to_api_models(biosample_model_dicts, bia_data_model.BioSample)

if persist_artefacts and biosamples:
output_dir = Path(settings.bia_data_dir) / "biosamples" / submission.accno
if not output_dir.is_dir():
output_dir.mkdir(parents=True)
logger.info(f"Created {output_dir}")
for biosample in biosamples:
output_path = output_dir / f"{biosample.uuid}.json"
output_path.write_text(biosample.model_dump_json(indent=2))
logger.info(f"Written {output_path}")
persist(biosamples, "biosamples", submission.accno)
return biosamples


Expand All @@ -41,7 +35,7 @@ def extract_biosample_dicts(submission: Submission) -> List[Dict[str, Any]]:

key_mapping = [
("title_id", "Title", ""),
("description", "Description", ""),
("biological_entity_description", "Biological entity", ""),
("organism", "Organism", ""),
]

Expand Down Expand Up @@ -93,9 +87,7 @@ def generate_biosample_uuid(biosample_dict: Dict[str, Any]) -> str:
"accno",
"title_id",
"organism_classification",
"description",
# TODO: Discuss including below in semantic_models.BioSample
# "biological_entity",
"biological_entity_description",
"intrinsic_variable_description",
"extrinsic_variable_description",
"experimental_variable_description",
Expand Down
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to revisit the logic of this file in light of the fact that we are not storing the list of file_references anymore. We may have to trigger the generation of file_references after obtaining the uuid for the experimental imaging dataset, so we can pass this to the function that creates file_references, allowing them to point to their parent experimental imaging dataset.

Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
find_sections_recursive,
get_generic_section_as_list,
dict_to_uuid,
get_generic_section_as_dict
get_generic_section_as_dict,
persist
)
import bia_ingest_sm.conversion.biosample as biosample_conversion
import bia_ingest_sm.conversion.study as study_conversion
Expand Down Expand Up @@ -69,10 +70,11 @@ def get_experimental_imaging_dataset(
)

analysis_method_list = []
biosample_list = []
image_acquisition_method_list = []
correlation_method_list = []
specimen_preparation_method_list = []
biosample_list = []
image_acquisition_list = []
specimen_preparation_protocol_list = []
specimen_growth_protocol_list = []

if len(associations) > 0:
# Image Analysis Method
Expand All @@ -81,7 +83,7 @@ def get_experimental_imaging_dataset(
]
for analysis_method in analysis_method_dict.values():
if (
analysis_method.method_description
analysis_method.protocol_description
in analysis_methods_from_associations
):
analysis_method_list.append(analysis_method)
Expand All @@ -96,12 +98,11 @@ def get_experimental_imaging_dataset(
study_component_file_references = file_reference_uuids.get(section_name, [])
model_dict = {
"title_id": section_name,
# "description": attr_dict["Description"],
"description": attr_dict["Description"],
"submitted_in_study": study_conversion.get_study_uuid(submission),
"file": study_component_file_references,
"image": [],
"specimen_preparation_method": specimen_preparation_method_list,
"acquisition_method": image_acquisition_method_list,
"specimen_imaging_preparation_protocol": specimen_preparation_protocol_list,
"acquisition_process": image_acquisition_list,
"specimen_growth_protocol": specimen_growth_protocol_list,
"biological_entity": biosample_list,
"analysis_method": analysis_method_list,
"correlation_method": correlation_method_list,
Expand All @@ -118,18 +119,7 @@ def get_experimental_imaging_dataset(
)

if persist_artefacts and experimental_imaging_dataset:
output_dir = (
Path(settings.bia_data_dir)
/ "experimental_imaging_datasets"
/ submission.accno
)
if not output_dir.is_dir():
output_dir.mkdir(parents=True)
logger.info(f"Created {output_dir}")
for dataset in experimental_imaging_dataset:
output_path = output_dir / f"{dataset.uuid}.json"
output_path.write_text(dataset.model_dump_json(indent=2))
logger.info(f"Written {output_path}")
persist(experimental_imaging_dataset, "experimental_imaging_dataset", submission.accno)

return experimental_imaging_dataset

Expand All @@ -139,7 +129,7 @@ def get_image_analysis_method(
) -> Dict[str, semantic_models.ImageAnalysisMethod]:

key_mapping = [
("method_description", "Title", None,),
("protocol_description", "Title", None,),
("features_analysed", "Image analysis overview", None,),
]

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import logging
from pathlib import Path
from typing import List, Any, Dict
from .utils import (
dicts_to_api_models,
find_sections_recursive,
dict_to_uuid,
persist
)
from ..biostudies import (
Submission,
attributes_to_dict,
)
from ..config import settings
from src.bia_models import bia_data_model, semantic_models

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# NOTE(review): basicConfig runs at import time and configures the root logger
# (it does nothing if the root logger already has handlers) — consider moving
# this to the application entry point.
logging.basicConfig(level=logging.INFO)


def get_image_acquisition(
    submission: Submission, persist_artefacts=False
) -> List[bia_data_model.ImageAcquisition]:
    """Build ImageAcquisition models for a submission.

    The dicts produced by ``extract_image_acquisition_dicts`` are passed to
    ``dicts_to_api_models`` together with ``bia_data_model.ImageAcquisition``.

    Args:
        submission: The submission to read "Image acquisition" sections from.
        persist_artefacts: When true and at least one model was produced,
            hand the models to ``persist`` so they are written to disk.

    Returns:
        The models returned by ``dicts_to_api_models``.
    """
    image_acquisition_model_dicts = extract_image_acquisition_dicts(submission)
    image_acquisitions = dicts_to_api_models(
        image_acquisition_model_dicts, bia_data_model.ImageAcquisition
    )

    if persist_artefacts and image_acquisitions:
        # Store under a directory of their own; the previous value
        # ("specimen_growth_protocol") was a copy-paste slip that mixed these
        # files in with the specimen growth protocol artefacts.
        persist(image_acquisitions, "image_acquisition", submission.accno)

    return image_acquisitions


def extract_image_acquisition_dicts(submission: Submission) -> List[Dict[str, Any]]:
    """Return one model dict per "Image acquisition" section of a submission.

    Each dict holds the mapped section attributes plus ``fbbi_id``, ``accno``,
    ``accession_id`` and a ``uuid`` computed from the other values.
    """
    # (model field, submission attribute name, value used when absent)
    field_sources = (
        ("title_id", "Title", ""),
        ("protocol_description", "Image acquisition parameters", ""),
        ("imaging_instrument_description", "Imaging instrument", ""),
        ("imaging_method_name", "Imaging method", ""),
    )

    results = []
    for sec in find_sections_recursive(
        submission.section, ["Image acquisition"], []
    ):
        attrs = attributes_to_dict(sec.attributes)

        entry = {}
        for field, source, fallback in field_sources:
            entry[field] = attrs.get(source, fallback)

        # TODO: change template / create logic to lookup the fbbi ID
        entry["fbbi_id"] = []

        entry["accno"] = sec.__dict__.get("accno", "")
        entry["accession_id"] = submission.accno
        # The uuid is computed last because it is derived from the fields above.
        entry["uuid"] = generate_image_acquisition_uuid(entry)
        results.append(entry)

    return results


def generate_image_acquisition_uuid(protocol_dict: Dict[str, Any]) -> str:
    """Return the uuid string for an image acquisition model dict.

    Delegates to ``dict_to_uuid``, passing the ordered list of keys of
    ``protocol_dict`` that contribute to the identifier. Every listed key is
    expected to be present in ``protocol_dict``.
    """
    return dict_to_uuid(
        protocol_dict,
        [
            "accession_id",
            "accno",
            "title_id",
            "protocol_description",
            "imaging_instrument_description",
            "imaging_method_name",
            "fbbi_id",
        ],
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import logging
from pathlib import Path
from typing import List, Any, Dict
from .utils import (
dicts_to_api_models,
find_sections_recursive,
dict_to_uuid,
persist
)
from ..biostudies import (
Submission,
attributes_to_dict,
)
from ..config import settings
from src.bia_models import bia_data_model, semantic_models

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# NOTE(review): basicConfig runs at import time and configures the root logger
# (it does nothing if the root logger already has handlers) — consider moving
# this to the application entry point.
logging.basicConfig(level=logging.INFO)


def get_specimen_growth_protocol(
    submission: Submission, persist_artefacts=False
) -> List[bia_data_model.SpecimenGrowthProtocol]:
    """Build SpecimenGrowthProtocol models for a submission.

    The dicts produced by ``extract_specimen_growth_protocol_dicts`` are passed
    to ``dicts_to_api_models`` together with
    ``bia_data_model.SpecimenGrowthProtocol``.

    Args:
        submission: The submission to read "Specimen" sections from.
        persist_artefacts: When true and at least one model was produced,
            hand the models to ``persist`` so they are written to disk.

    Returns:
        The models returned by ``dicts_to_api_models``. (The annotation
        previously named ``Specimen``, which is not what is built here.)
    """
    specimen_growth_protocol_model_dicts = extract_specimen_growth_protocol_dicts(
        submission
    )
    specimen_growth_protocols = dicts_to_api_models(
        specimen_growth_protocol_model_dicts,
        bia_data_model.SpecimenGrowthProtocol,
    )

    if persist_artefacts and specimen_growth_protocols:
        persist(
            specimen_growth_protocols, "specimen_growth_protocol", submission.accno
        )

    return specimen_growth_protocols


def extract_specimen_growth_protocol_dicts(submission: Submission) -> List[Dict[str, Any]]:
    """Return one model dict per "Specimen" section of a submission.

    Each dict holds the section's title and growth protocol text plus
    ``accno``, ``accession_id`` and a ``uuid`` computed from those values.
    """
    # (model field, submission attribute name, value used when absent)
    field_sources = (
        ("title_id", "Title", ""),
        ("protocol_description", "Growth protocol", ""),
    )

    results = []
    for sec in find_sections_recursive(submission.section, ["Specimen"], []):
        attrs = attributes_to_dict(sec.attributes)

        entry = {}
        for field, source, fallback in field_sources:
            entry[field] = attrs.get(source, fallback)

        entry["accno"] = sec.__dict__.get("accno", "")
        entry["accession_id"] = submission.accno
        # The uuid is computed last because it is derived from the fields above.
        entry["uuid"] = generate_specimen_growth_protocol_uuid(entry)
        results.append(entry)

    return results


def generate_specimen_growth_protocol_uuid(protocol_dict: Dict[str, Any]) -> str:
    """Return the uuid string for a specimen growth protocol model dict.

    Delegates to ``dict_to_uuid``, passing the ordered list of keys of
    ``protocol_dict`` that contribute to the identifier. Every listed key is
    expected to be present in ``protocol_dict``.
    """
    return dict_to_uuid(
        protocol_dict,
        [
            "accession_id",
            "accno",
            "title_id",
            "protocol_description",
        ],
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import logging
from pathlib import Path
from typing import List, Any, Dict
from .utils import (
dicts_to_api_models,
find_sections_recursive,
dict_to_uuid,
persist
)
from ..biostudies import (
Submission,
attributes_to_dict,
)
from ..config import settings
from src.bia_models import bia_data_model, semantic_models

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# NOTE(review): basicConfig runs at import time and configures the root logger
# (it does nothing if the root logger already has handlers) — consider moving
# this to the application entry point.
logging.basicConfig(level=logging.INFO)


def get_specimen_preparation_protocol(
    submission: Submission, persist_artefacts=False
) -> List[bia_data_model.SpecimenPrepartionProtocol]:
    """Build specimen preparation protocol models for a submission.

    The dicts produced by ``extract_specimen_preparation_protocol_dicts`` are
    passed to ``dicts_to_api_models`` together with
    ``bia_data_model.SpecimenPrepartionProtocol`` (spelling as used by this
    module when referencing the model class).

    Args:
        submission: The submission to read "Specimen" sections from.
        persist_artefacts: When true and at least one model was produced,
            hand the models to ``persist`` so they are written to disk.

    Returns:
        The models returned by ``dicts_to_api_models``. (The annotation
        previously named ``Specimen``, which is not what is built here.)
    """
    specimen_preparation_protocol_model_dicts = (
        extract_specimen_preparation_protocol_dicts(submission)
    )
    specimen_preparation_protocols = dicts_to_api_models(
        specimen_preparation_protocol_model_dicts,
        bia_data_model.SpecimenPrepartionProtocol,
    )

    if persist_artefacts and specimen_preparation_protocols:
        persist(
            specimen_preparation_protocols,
            "specimen_imaging_protocol",
            submission.accno,
        )

    return specimen_preparation_protocols


def extract_specimen_preparation_protocol_dicts(submission: Submission) -> List[Dict[str, Any]]:
    """Return one model dict per "Specimen" section of a submission.

    Each dict holds the section's title and sample preparation protocol text,
    an empty ``signal_channel_information`` list, ``accno``, ``accession_id``
    and a ``uuid`` computed from the identifying values.
    """
    # (model field, submission attribute name, value used when absent)
    field_sources = (
        ("title_id", "Title", ""),
        ("protocol_description", "Sample preparation protocol", ""),
    )

    results = []
    for sec in find_sections_recursive(submission.section, ["Specimen"], []):
        attrs = attributes_to_dict(sec.attributes)

        entry = {}
        for field, source, fallback in field_sources:
            entry[field] = attrs.get(source, fallback)

        # Currently generates empty list as we need to change the submission template
        entry["signal_channel_information"] = []

        entry["accno"] = sec.__dict__.get("accno", "")
        entry["accession_id"] = submission.accno
        # The uuid is computed last because it is derived from the fields above.
        entry["uuid"] = generate_specimen_preparation_uuid(entry)
        results.append(entry)

    return results


def generate_specimen_preparation_uuid(protocol_dict: Dict[str, Any]) -> str:
    """Return the uuid string for a specimen preparation protocol model dict.

    Delegates to ``dict_to_uuid``, passing the ordered list of keys of
    ``protocol_dict`` that contribute to the identifier. Every listed key is
    expected to be present in ``protocol_dict``.
    """
    return dict_to_uuid(
        protocol_dict,
        [
            "accession_id",
            "accno",
            "title_id",
            "protocol_description",
        ],
    )
3 changes: 2 additions & 1 deletion bia-ingest-shared-models/bia_ingest_sm/conversion/study.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think re module is used

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's used in:


def get_licence(study_attributes: Dict[str, Any]) -> semantic_models.LicenceType:
    """
    Return enum version of licence of study
    """
    licence = re.sub(r"\s", "_", study_attributes.get("License", "CC0"))
    return semantic_models.LicenceType(licence)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But I guess we've changed the enums now, so we don't need that?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I missed this - no I think we still need it!

Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
get_generic_section_as_dict,
mattributes_to_dict,
dict_to_uuid,
find_sections_recursive
find_sections_recursive,
persist
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't end up using this - so should remove the unused import (will do it in this PR if other changes are needed)

)
import bia_ingest_sm.conversion.experimental_imaging_dataset as eid_conversion
from ..biostudies import (
Expand Down
11 changes: 11 additions & 0 deletions bia-ingest-shared-models/bia_ingest_sm/conversion/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,14 @@ def dict_to_uuid(my_dict: Dict[str, Any], attributes_to_consider: List[str]) ->
seed = "".join([f"{my_dict[attr]}" for attr in attributes_to_consider])
hexdigest = hashlib.md5(seed.encode("utf-8")).hexdigest()
return str(uuid.UUID(version=4, hex=hexdigest))


def persist(object_list: List, object_path: str, sumbission_accno: str):
    """Write each object to its own JSON file under the data directory.

    Files are written to
    ``settings.bia_data_dir / object_path / sumbission_accno / <uuid>.json``.

    Args:
        object_list: Objects exposing a ``uuid`` attribute and a
            ``model_dump_json(indent=...)`` method.
        object_path: Sub-directory of the data directory for this object kind.
        sumbission_accno: Accession number of the submission, used as the
            innermost directory name. (The parameter name's spelling is kept
            so existing keyword callers keep working.)
    """
    output_dir = Path(settings.bia_data_dir) / object_path / sumbission_accno
    if not output_dir.is_dir():
        # exist_ok avoids a spurious FileExistsError if the directory appears
        # between the check above and this call.
        output_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Created {output_dir}")
    # Loop variable deliberately not called `object`, which shadows the builtin.
    for model in object_list:
        output_path = output_dir / f"{model.uuid}.json"
        output_path.write_text(model.model_dump_json(indent=2))
        logger.info(f"Written {output_path}")
17 changes: 16 additions & 1 deletion bia-ingest-shared-models/test/test_shared_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
from bia_ingest_sm.conversion import (
biosample,
experimental_imaging_dataset,
study
study,
specimen_imaging_protocol,
specimen_growth_protocol,
image_acquisition,
)
from bia_ingest_sm.biostudies import requests

Expand Down Expand Up @@ -35,6 +38,18 @@ def mock_request_get(flist_url: str) -> Dict[str, str]:
utils.get_test_experimental_imaging_dataset,
experimental_imaging_dataset.get_experimental_imaging_dataset,
),
(
utils.get_test_specimen_preparation_protocol,
specimen_imaging_protocol.get_specimen_preparation_protocol,
),
(
utils.get_test_specimen_growth_protocol,
specimen_growth_protocol.get_specimen_growth_protocol,
),
(
utils.get_test_image_acquisition,
image_acquisition.get_image_acquisition,
),
# Not testing as we need to deal with links that are not proper
# urls
# (utils.get_test_external_reference, conversion.get_external_reference,),
Expand Down
Loading
Loading