Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Model updates, and some conversion logic #123

Merged
merged 8 commits into from
Jul 22, 2024
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import logging
from typing import List, Any, Dict
from .utils import (
dicts_to_api_models,
find_sections_recursive,
dict_to_uuid,
persist
)
from ..biostudies import (
Submission,
attributes_to_dict,
)
from bia_shared_datamodels import bia_data_model

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


def get_annotation_method(
    submission: Submission, persist_artefacts=False
) -> List[bia_data_model.AnnotationMethod]:
    """
    Return the annotation methods described in a submission.

    The per-section dicts produced by ``extract_annotation_method_dicts``
    are handed to ``dicts_to_api_models`` together with
    ``bia_data_model.AnnotationMethod``. When ``persist_artefacts`` is truthy
    and at least one method was produced, the result is also passed to
    ``persist`` under the name "annotation_method" and the submission's
    accession number (presumably writing them to disk; confirm against
    ``utils.persist``).
    """
    method_dicts = extract_annotation_method_dicts(submission)
    methods = dicts_to_api_models(method_dicts, bia_data_model.AnnotationMethod)

    # Nothing to write unless persistence was requested and there is output.
    if not (persist_artefacts and methods):
        return methods

    persist(methods, "annotation_method", submission.accno)
    return methods


def extract_annotation_method_dicts(submission: Submission) -> List[Dict[str, Any]]:
    """
    Build one dict per "Annotations" section found in the submission.

    Each dict carries the mapped section attributes, an empty
    ``source_dataset`` list, the section's ``accno`` (empty string when the
    section has none in its ``__dict__``), the submission's accession number
    as ``accession_id`` and a ``uuid`` computed by
    ``generate_annotation_method_uuid``.
    """
    # (model field, attribute name in the section, fallback when absent)
    field_specs = [
        ("title_id", "Name", ""),
        ("protocol_description", "Annotation overview", ""),
        ("annotation_criteria", "Annotation criteria", ""),
        ("annotation_coverage", "Annotation coverage", ""),
        ("method_type", "Annotation method", "other"),
    ]

    sections = find_sections_recursive(submission.section, ["Annotations"], [])

    results = []
    for annotation_section in sections:
        attributes = attributes_to_dict(annotation_section.attributes)

        entry = {}
        for field, source_key, fallback in field_specs:
            entry[field] = attributes.get(source_key, fallback)

        entry.update(
            # TODO: change template to get source dataset information
            source_dataset=[],
            accno=annotation_section.__dict__.get("accno", ""),
            accession_id=submission.accno,
        )

        # The uuid is derived from the fields set above, so it must come last.
        entry["uuid"] = generate_annotation_method_uuid(entry)
        results.append(entry)

    return results


def generate_annotation_method_uuid(protocol_dict: Dict[str, Any]) -> str:
    """
    Return the uuid for an annotation-method dict.

    Delegates to ``dict_to_uuid``, passing the list of keys of
    ``protocol_dict`` that should contribute to the identifier.
    """
    return dict_to_uuid(
        protocol_dict,
        [
            "accession_id",
            "accno",
            "title_id",
            "protocol_description",
            "annotation_criteria",
            "annotation_coverage",
            "method_type",
            "source_dataset",
        ],
    )
22 changes: 6 additions & 16 deletions bia-ingest-shared-models/bia_ingest_sm/conversion/biosample.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
import logging
from pathlib import Path
from typing import List, Any, Dict
from .utils import (
dicts_to_api_models,
find_sections_recursive,
dict_to_uuid
dict_to_uuid,
persist
)
from ..biostudies import (
Submission,
attributes_to_dict,
)
from ..config import settings
from src.bia_models import bia_data_model, semantic_models
from bia_shared_datamodels import bia_data_model, semantic_models

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
Expand All @@ -25,14 +24,7 @@ def get_biosample(
biosamples = dicts_to_api_models(biosample_model_dicts, bia_data_model.BioSample)

if persist_artefacts and biosamples:
output_dir = Path(settings.bia_data_dir) / "biosamples" / submission.accno
if not output_dir.is_dir():
output_dir.mkdir(parents=True)
logger.info(f"Created {output_dir}")
for biosample in biosamples:
output_path = output_dir / f"{biosample.uuid}.json"
output_path.write_text(biosample.model_dump_json(indent=2))
logger.info(f"Written {output_path}")
persist(biosamples, "biosamples", submission.accno)
return biosamples


Expand All @@ -41,7 +33,7 @@ def extract_biosample_dicts(submission: Submission) -> List[Dict[str, Any]]:

key_mapping = [
("title_id", "Title", ""),
("description", "Description", ""),
("biological_entity_description", "Biological entity", ""),
("organism", "Organism", ""),
]

Expand Down Expand Up @@ -93,9 +85,7 @@ def generate_biosample_uuid(biosample_dict: Dict[str, Any]) -> str:
"accno",
"title_id",
"organism_classification",
"description",
# TODO: Discuss including below in semantic_models.BioSample
# "biological_entity",
"biological_entity_description",
"intrinsic_variable_description",
"extrinsic_variable_description",
"experimental_variable_description",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,21 @@
import logging
from pathlib import Path
from typing import List, Dict
from typing import List, Dict, Any
from .utils import (
find_sections_recursive,
get_generic_section_as_list,
dict_to_uuid,
get_generic_section_as_dict
get_generic_section_as_dict,
persist
)
from .file_reference import get_file_reference_by_study_component
import bia_ingest_sm.conversion.biosample as biosample_conversion
import bia_ingest_sm.conversion.study as study_conversion
from ..biostudies import (
Submission,
attributes_to_dict,
find_file_lists_in_submission,
flist_from_flist_fname,
file_uri,
)
from ..config import settings
from src.bia_models import bia_data_model, semantic_models
from bia_shared_datamodels import bia_data_model, semantic_models

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
Expand Down Expand Up @@ -58,9 +56,6 @@ def get_experimental_imaging_dataset(
for section in study_components:
attr_dict = attributes_to_dict(section.attributes)
key_mapping = [
("biosample", "Biosample", None,),
("specimen", "Specimen", None,),
("image_acquisition", "Image acquisition", None,),
("image_analysis", "Image analysis", None,),
("image_correlation", "Image correlation", None,),
]
Expand All @@ -69,19 +64,18 @@ def get_experimental_imaging_dataset(
)

analysis_method_list = []
biosample_list = []
image_acquisition_method_list = []
correlation_method_list = []
specimen_preparation_method_list = []
biosample_list = []

#TODO: move this to main CLI code to make object generation more independent
if len(associations) > 0:
# Image Analysis Method
analysis_methods_from_associations = [
a.get("image_analysis") for a in associations
]
for analysis_method in analysis_method_dict.values():
if (
analysis_method.method_description
analysis_method.protocol_description
in analysis_methods_from_associations
):
analysis_method_list.append(analysis_method)
Expand All @@ -92,44 +86,23 @@ def get_experimental_imaging_dataset(
if biosample in biosamples_in_submission_uuid:
biosample_list.extend(biosamples_in_submission_uuid[biosample])


section_name = attr_dict["Name"]
study_component_file_references = file_reference_uuids.get(section_name, [])
model_dict = {
"title_id": section_name,
# "description": attr_dict["Description"],
"submitted_in_study": study_conversion.get_study_uuid(submission),
"file": study_component_file_references,
"image": [],
"specimen_preparation_method": specimen_preparation_method_list,
"acquisition_method": image_acquisition_method_list,
"biological_entity": biosample_list,
"description": attr_dict["Description"],
"submitted_in_study_uuid": study_conversion.get_study_uuid(submission),
"analysis_method": analysis_method_list,
"correlation_method": correlation_method_list,
"file_reference_count": len(study_component_file_references),
"image_count": 0,
"example_image_uri": [],
}
# TODO: Add 'description' to computation of uuid (Maybe accno?)
model_dict["uuid"] = dict_to_uuid(
model_dict, ["title_id", "submitted_in_study",]
)
model_dict["uuid"] = generate_experimental_imaging_dataset_uuid(model_dict)
experimental_imaging_dataset.append(
bia_data_model.ExperimentalImagingDataset.model_validate(model_dict)
)

if persist_artefacts and experimental_imaging_dataset:
output_dir = (
Path(settings.bia_data_dir)
/ "experimental_imaging_datasets"
/ submission.accno
)
if not output_dir.is_dir():
output_dir.mkdir(parents=True)
logger.info(f"Created {output_dir}")
for dataset in experimental_imaging_dataset:
output_path = output_dir / f"{dataset.uuid}.json"
output_path.write_text(dataset.model_dump_json(indent=2))
logger.info(f"Written {output_path}")
persist(experimental_imaging_dataset, "experimental_imaging_dataset", submission.accno)

return experimental_imaging_dataset

Expand All @@ -139,7 +112,7 @@ def get_image_analysis_method(
) -> Dict[str, semantic_models.ImageAnalysisMethod]:

key_mapping = [
("method_description", "Title", None,),
("protocol_description", "Title", None,),
("features_analysed", "Image analysis overview", None,),
]

Expand All @@ -151,48 +124,10 @@ def get_image_analysis_method(
)


def get_file_reference_by_study_component(
    submission: Submission, persist_artefacts: bool = False
) -> Dict[str, List[bia_data_model.FileReference]]:
    """
    Return a dict mapping study component names to their file references.

    For every file list found in the submission, each listed file gets a
    uuid computed from the accession id, file name and size; the uuids are
    collected under the file list's "Name". NOTE(review): the values
    collected are the results of ``dict_to_uuid``, not ``FileReference``
    objects as the return annotation suggests; confirm which is intended.

    When ``persist_artefacts`` is true, a ``FileReference`` is also validated
    for each file and written as ``<uuid>.json`` beneath
    ``<bia_data_dir>/file_references/<accno>``.
    """
    file_lists = find_file_lists_in_submission(submission)
    uuids_by_component = {}

    if persist_artefacts:
        output_dir = Path(settings.bia_data_dir) / "file_references" / submission.accno
        if not output_dir.is_dir():
            output_dir.mkdir(parents=True)
            logger.info(f"Created {output_dir}")

    for file_list in file_lists:
        # Several file lists may share a component name; reuse its bucket.
        component_uuids = uuids_by_component.setdefault(file_list["Name"], [])

        listed_files = flist_from_flist_fname(submission.accno, file_list["File List"])
        for listed_file in listed_files:
            file_dict = {
                "accession_id": submission.accno,
                "file_name": str(listed_file.path),
                "size_in_bytes": str(listed_file.size),
            }
            fileref_uuid = dict_to_uuid(
                file_dict, ["accession_id", "file_name", "size_in_bytes"]
            )
            component_uuids.append(fileref_uuid)

            # TODO - Not storing submission_dataset uuid yet!!!
            if not persist_artefacts:
                continue

            file_reference = bia_data_model.FileReference.model_validate(
                {
                    **file_dict,
                    "uuid": fileref_uuid,
                    "uri": file_uri(submission.accno, listed_file),
                    # Placeholder: reuses the file reference's own uuid.
                    "submission_dataset": fileref_uuid,
                    "format": listed_file.type,
                    "attribute": attributes_to_dict(listed_file.attributes),
                }
            )
            output_path = output_dir / f"{fileref_uuid}.json"
            output_path.write_text(file_reference.model_dump_json(indent=2))
            logger.info(f"Written {output_path}")

    return uuids_by_component
def generate_experimental_imaging_dataset_uuid(
    experimental_imaging_dataset_dict: Dict[str, Any]
) -> str:
    """
    Return the uuid for an experimental-imaging-dataset dict.

    Delegates to ``dict_to_uuid``, passing "title_id" and
    "submitted_in_study_uuid" as the keys that contribute to the identifier.
    """
    # TODO: Add 'description' to computation of uuid (Maybe accno?)
    return dict_to_uuid(
        experimental_imaging_dataset_dict,
        ["title_id", "submitted_in_study_uuid"],
    )
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the new approach (file_reference points to its parent EID), we may have to rewrite this function (see the comment on the assignment of submission_dataset).

Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import logging
from pathlib import Path
from typing import List, Dict
from .utils import (
dict_to_uuid,
)
from ..biostudies import (
Submission,
attributes_to_dict,
find_file_lists_in_submission,
flist_from_flist_fname,
file_uri,
)
from ..config import settings
from bia_shared_datamodels import bia_data_model

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

def get_file_reference_by_study_component(
submission: Submission, persist_artefacts: bool = False
) -> Dict[str, List[bia_data_model.FileReference]]:
"""
Return Dict of list of file references in study components.
"""
file_list_dicts = find_file_lists_in_submission(submission)
fileref_to_study_components = {}

if persist_artefacts:
output_dir = Path(settings.bia_data_dir) / "file_references" / submission.accno
if not output_dir.is_dir():
output_dir.mkdir(parents=True)
logger.info(f"Created {output_dir}")

for file_list_dict in file_list_dicts:
study_component_name = file_list_dict["Name"]
if study_component_name not in fileref_to_study_components:
fileref_to_study_components[study_component_name] = []

fname = file_list_dict["File List"]
files_in_fl = flist_from_flist_fname(submission.accno, fname)
for f in files_in_fl:
file_dict = {
"accession_id": submission.accno,
"file_path": str(f.path),
"size_in_bytes": str(f.size),
}
fileref_uuid = dict_to_uuid(
file_dict, ["accession_id", "file_path", "size_in_bytes"]
)
fileref_to_study_components[study_component_name].append(fileref_uuid)
# TODO - Not storing submission_dataset uuid yet!!!
if persist_artefacts:
file_dict["uuid"] = fileref_uuid
file_dict["uri"] = file_uri(submission.accno, f)
file_dict["submission_dataset"] = fileref_uuid
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was just a placeholder - we need to pass the actual submission_dataset uuid (especially as this will now be the only link to its parent).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've not touched the file_reference code. That wasn't the intent of this PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK - I have created a ClickUp ticket to fix this, which is assigned to me.

file_dict["format"] = f.type
file_dict["attribute"] = attributes_to_dict(f.attributes)
file_reference = bia_data_model.FileReference.model_validate(file_dict)
output_path = output_dir / f"{fileref_uuid}.json"
output_path.write_text(file_reference.model_dump_json(indent=2))
logger.info(f"Written {output_path}")

return fileref_to_study_components
Loading
Loading