Skip to content

Commit

Permalink
Model updates, and some conversion logic (#123)
Browse files Browse the repository at this point in the history
* model updates to standardise names further and to account for which fields will actually be generated endpoints

* model updates and added specimen growth, preparation, and image acquisition conversion logic

* added logic to generate annotation method objects

* tidied up imports

* created empty annotation file list to make tests pass

* moved file reference conversion to its own file, fixed imports of shared models, and fixed file_name -> file_path as per model change for file references

* updated models and ingest code
  • Loading branch information
sherwoodf authored Jul 22, 2024
1 parent d4d4bf7 commit 66527f0
Show file tree
Hide file tree
Showing 18 changed files with 1,138 additions and 830 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import logging
from typing import List, Any, Dict
from .utils import (
dicts_to_api_models,
find_sections_recursive,
dict_to_uuid,
persist
)
from ..biostudies import (
Submission,
attributes_to_dict,
)
from bia_shared_datamodels import bia_data_model

# Module-level logger for this conversion module.
logger = logging.getLogger(__name__)
# NOTE(review): calling basicConfig at import time configures the root logger
# as a side effect of importing this module; consider moving this call to the
# application entry point.
logging.basicConfig(level=logging.INFO)


def get_annotation_method(
    submission: Submission, persist_artefacts=False
) -> List[bia_data_model.AnnotationMethod]:
    """Build AnnotationMethod models from a BioStudies submission.

    Extracts one model per "Annotations" section of the submission and,
    when ``persist_artefacts`` is truthy and at least one model was built,
    writes the artefacts out via :func:`persist`.
    """
    method_dicts = extract_annotation_method_dicts(submission)
    methods = dicts_to_api_models(method_dicts, bia_data_model.AnnotationMethod)

    if methods and persist_artefacts:
        persist(methods, "annotation_method", submission.accno)

    return methods


def extract_annotation_method_dicts(submission: Submission) -> List[Dict[str, Any]]:
    """Extract one AnnotationMethod dict per "Annotations" section.

    Each dict maps model field names to the matching section attribute
    values (using the defaults in ``key_mapping`` when an attribute is
    absent) and is stamped with the section accno, the submission
    accession id, and a deterministic uuid derived from its contents.
    """
    annotation_sections = find_sections_recursive(
        submission.section, ["Annotations"], []
    )

    # (model field, section attribute name, default when attribute is absent)
    key_mapping = [
        ("title_id", "Name", ""),
        ("protocol_description", "Annotation overview", ""),
        ("annotation_criteria", "Annotation criteria", ""),
        ("annotation_coverage", "Annotation coverage", ""),
        ("method_type", "Annotation method", "other"),
    ]

    model_dicts = []
    for section in annotation_sections:
        attr_dict = attributes_to_dict(section.attributes)

        model_dict = {k: attr_dict.get(v, default) for k, v, default in key_mapping}

        # TODO: change template to get source dataset information
        model_dict["source_dataset"] = []

        # getattr instead of section.__dict__.get: equivalent for plain
        # attributes, but also resolves properties and __slots__ fields.
        model_dict["accno"] = getattr(section, "accno", "")
        model_dict["accession_id"] = submission.accno
        # uuid must be computed after all other keys are in place.
        model_dict["uuid"] = generate_annotation_method_uuid(model_dict)
        model_dicts.append(model_dict)

    return model_dicts


def generate_annotation_method_uuid(protocol_dict: Dict[str, Any]) -> str:
    """Compute a deterministic uuid from the identifying fields of the dict."""
    return dict_to_uuid(
        protocol_dict,
        [
            "accession_id",
            "accno",
            "title_id",
            "protocol_description",
            "annotation_criteria",
            "annotation_coverage",
            "method_type",
            "source_dataset",
        ],
    )
22 changes: 6 additions & 16 deletions bia-ingest-shared-models/bia_ingest_sm/conversion/biosample.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
import logging
from pathlib import Path
from typing import List, Any, Dict
from .utils import (
dicts_to_api_models,
find_sections_recursive,
dict_to_uuid
dict_to_uuid,
persist
)
from ..biostudies import (
Submission,
attributes_to_dict,
)
from ..config import settings
from src.bia_models import bia_data_model, semantic_models
from bia_shared_datamodels import bia_data_model, semantic_models

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
Expand All @@ -25,14 +24,7 @@ def get_biosample(
biosamples = dicts_to_api_models(biosample_model_dicts, bia_data_model.BioSample)

if persist_artefacts and biosamples:
output_dir = Path(settings.bia_data_dir) / "biosamples" / submission.accno
if not output_dir.is_dir():
output_dir.mkdir(parents=True)
logger.info(f"Created {output_dir}")
for biosample in biosamples:
output_path = output_dir / f"{biosample.uuid}.json"
output_path.write_text(biosample.model_dump_json(indent=2))
logger.info(f"Written {output_path}")
persist(biosamples, "biosamples", submission.accno)
return biosamples


Expand All @@ -41,7 +33,7 @@ def extract_biosample_dicts(submission: Submission) -> List[Dict[str, Any]]:

key_mapping = [
("title_id", "Title", ""),
("description", "Description", ""),
("biological_entity_description", "Biological entity", ""),
("organism", "Organism", ""),
]

Expand Down Expand Up @@ -93,9 +85,7 @@ def generate_biosample_uuid(biosample_dict: Dict[str, Any]) -> str:
"accno",
"title_id",
"organism_classification",
"description",
# TODO: Discuss including below in semantic_models.BioSample
# "biological_entity",
"biological_entity_description",
"intrinsic_variable_description",
"extrinsic_variable_description",
"experimental_variable_description",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,21 @@
import logging
from pathlib import Path
from typing import List, Dict
from typing import List, Dict, Any
from .utils import (
find_sections_recursive,
get_generic_section_as_list,
dict_to_uuid,
get_generic_section_as_dict
get_generic_section_as_dict,
persist
)
from .file_reference import get_file_reference_by_study_component
import bia_ingest_sm.conversion.biosample as biosample_conversion
import bia_ingest_sm.conversion.study as study_conversion
from ..biostudies import (
Submission,
attributes_to_dict,
find_file_lists_in_submission,
flist_from_flist_fname,
file_uri,
)
from ..config import settings
from src.bia_models import bia_data_model, semantic_models
from bia_shared_datamodels import bia_data_model, semantic_models

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
Expand Down Expand Up @@ -58,9 +56,6 @@ def get_experimental_imaging_dataset(
for section in study_components:
attr_dict = attributes_to_dict(section.attributes)
key_mapping = [
("biosample", "Biosample", None,),
("specimen", "Specimen", None,),
("image_acquisition", "Image acquisition", None,),
("image_analysis", "Image analysis", None,),
("image_correlation", "Image correlation", None,),
]
Expand All @@ -69,19 +64,18 @@ def get_experimental_imaging_dataset(
)

analysis_method_list = []
biosample_list = []
image_acquisition_method_list = []
correlation_method_list = []
specimen_preparation_method_list = []
biosample_list = []

#TODO: move this to main CLI code to make object generation more independent
if len(associations) > 0:
# Image Analysis Method
analysis_methods_from_associations = [
a.get("image_analysis") for a in associations
]
for analysis_method in analysis_method_dict.values():
if (
analysis_method.method_description
analysis_method.protocol_description
in analysis_methods_from_associations
):
analysis_method_list.append(analysis_method)
Expand All @@ -92,44 +86,23 @@ def get_experimental_imaging_dataset(
if biosample in biosamples_in_submission_uuid:
biosample_list.extend(biosamples_in_submission_uuid[biosample])


section_name = attr_dict["Name"]
study_component_file_references = file_reference_uuids.get(section_name, [])
model_dict = {
"title_id": section_name,
# "description": attr_dict["Description"],
"submitted_in_study": study_conversion.get_study_uuid(submission),
"file": study_component_file_references,
"image": [],
"specimen_preparation_method": specimen_preparation_method_list,
"acquisition_method": image_acquisition_method_list,
"biological_entity": biosample_list,
"description": attr_dict["Description"],
"submitted_in_study_uuid": study_conversion.get_study_uuid(submission),
"analysis_method": analysis_method_list,
"correlation_method": correlation_method_list,
"file_reference_count": len(study_component_file_references),
"image_count": 0,
"example_image_uri": [],
}
# TODO: Add 'description' to computation of uuid (Maybe accno?)
model_dict["uuid"] = dict_to_uuid(
model_dict, ["title_id", "submitted_in_study",]
)
model_dict["uuid"] = generate_experimental_imaging_dataset_uuid(model_dict)
experimental_imaging_dataset.append(
bia_data_model.ExperimentalImagingDataset.model_validate(model_dict)
)

if persist_artefacts and experimental_imaging_dataset:
output_dir = (
Path(settings.bia_data_dir)
/ "experimental_imaging_datasets"
/ submission.accno
)
if not output_dir.is_dir():
output_dir.mkdir(parents=True)
logger.info(f"Created {output_dir}")
for dataset in experimental_imaging_dataset:
output_path = output_dir / f"{dataset.uuid}.json"
output_path.write_text(dataset.model_dump_json(indent=2))
logger.info(f"Written {output_path}")
persist(experimental_imaging_dataset, "experimental_imaging_dataset", submission.accno)

return experimental_imaging_dataset

Expand All @@ -139,7 +112,7 @@ def get_image_analysis_method(
) -> Dict[str, semantic_models.ImageAnalysisMethod]:

key_mapping = [
("method_description", "Title", None,),
("protocol_description", "Title", None,),
("features_analysed", "Image analysis overview", None,),
]

Expand All @@ -151,48 +124,10 @@ def get_image_analysis_method(
)


def get_file_reference_by_study_component(
submission: Submission, persist_artefacts: bool = False
) -> Dict[str, List[bia_data_model.FileReference]]:
"""
Return Dict of list of file references in study components.
"""
file_list_dicts = find_file_lists_in_submission(submission)
fileref_to_study_components = {}

if persist_artefacts:
output_dir = Path(settings.bia_data_dir) / "file_references" / submission.accno
if not output_dir.is_dir():
output_dir.mkdir(parents=True)
logger.info(f"Created {output_dir}")

for file_list_dict in file_list_dicts:
study_component_name = file_list_dict["Name"]
if study_component_name not in fileref_to_study_components:
fileref_to_study_components[study_component_name] = []

fname = file_list_dict["File List"]
files_in_fl = flist_from_flist_fname(submission.accno, fname)
for f in files_in_fl:
file_dict = {
"accession_id": submission.accno,
"file_name": str(f.path),
"size_in_bytes": str(f.size),
}
fileref_uuid = dict_to_uuid(
file_dict, ["accession_id", "file_name", "size_in_bytes"]
)
fileref_to_study_components[study_component_name].append(fileref_uuid)
# TODO - Not storing submission_dataset uuid yet!!!
if persist_artefacts:
file_dict["uuid"] = fileref_uuid
file_dict["uri"] = file_uri(submission.accno, f)
file_dict["submission_dataset"] = fileref_uuid
file_dict["format"] = f.type
file_dict["attribute"] = attributes_to_dict(f.attributes)
file_reference = bia_data_model.FileReference.model_validate(file_dict)
output_path = output_dir / f"{fileref_uuid}.json"
output_path.write_text(file_reference.model_dump_json(indent=2))
logger.info(f"Written {output_path}")

return fileref_to_study_components
def generate_experimental_imaging_dataset_uuid(experimental_imaging_dataset_dict: Dict[str, Any]) -> str:
    """Compute a deterministic uuid for an experimental imaging dataset dict."""
    # TODO: Add 'description' to computation of uuid (Maybe accno?)
    return dict_to_uuid(
        experimental_imaging_dataset_dict,
        ["title_id", "submitted_in_study_uuid"],
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import logging
from pathlib import Path
from typing import List, Dict
from .utils import (
dict_to_uuid,
)
from ..biostudies import (
Submission,
attributes_to_dict,
find_file_lists_in_submission,
flist_from_flist_fname,
file_uri,
)
from ..config import settings
from bia_shared_datamodels import bia_data_model

# Module-level logger for this conversion module.
logger = logging.getLogger(__name__)
# NOTE(review): basicConfig at import time configures the root logger as a
# side effect of importing this module; consider moving it to the entry point.
logging.basicConfig(level=logging.INFO)

def get_file_reference_by_study_component(
    submission: Submission, persist_artefacts: bool = False
) -> Dict[str, List[bia_data_model.FileReference]]:
    """
    Return Dict of list of file references in study components.

    Keys are study-component names (the "Name" entry of each file-list
    dict). NOTE(review): the values appended below are the uuids returned
    by dict_to_uuid, not FileReference instances — confirm whether the
    declared return type should be Dict[str, List[str]].

    When ``persist_artefacts`` is True, a FileReference model is also
    validated and written to
    ``<bia_data_dir>/file_references/<accno>/<uuid>.json``.
    """
    file_list_dicts = find_file_lists_in_submission(submission)
    fileref_to_study_components = {}

    if persist_artefacts:
        output_dir = Path(settings.bia_data_dir) / "file_references" / submission.accno
        if not output_dir.is_dir():
            output_dir.mkdir(parents=True)
            logger.info(f"Created {output_dir}")

    for file_list_dict in file_list_dicts:
        study_component_name = file_list_dict["Name"]
        if study_component_name not in fileref_to_study_components:
            fileref_to_study_components[study_component_name] = []

        fname = file_list_dict["File List"]
        files_in_fl = flist_from_flist_fname(submission.accno, fname)
        for f in files_in_fl:
            # Only these three fields participate in the uuid computation,
            # so file_dict must not gain extra keys before dict_to_uuid runs.
            file_dict = {
                "accession_id": submission.accno,
                "file_path": str(f.path),
                "size_in_bytes": str(f.size),
            }
            fileref_uuid = dict_to_uuid(
                file_dict, ["accession_id", "file_path", "size_in_bytes"]
            )
            fileref_to_study_components[study_component_name].append(fileref_uuid)
            # TODO - Not storing submission_dataset uuid yet!!!
            if persist_artefacts:
                file_dict["uuid"] = fileref_uuid
                file_dict["uri"] = file_uri(submission.accno, f)
                # Placeholder: the file reference's own uuid stands in for the
                # submission dataset uuid until it is available (TODO above).
                file_dict["submission_dataset"] = fileref_uuid
                file_dict["format"] = f.type
                file_dict["attribute"] = attributes_to_dict(f.attributes)
                file_reference = bia_data_model.FileReference.model_validate(file_dict)
                output_path = output_dir / f"{fileref_uuid}.json"
                output_path.write_text(file_reference.model_dump_json(indent=2))
                logger.info(f"Written {output_path}")

    return fileref_to_study_components
Loading

0 comments on commit 66527f0

Please sign in to comment.