Update file reference (#143)
* Modify creation of FileReference objects to account for changes in shared models

* Add test for FileReference objects

* Add warning message for discrepancy in number of datasets

Emit a warning message if the number of datasets passed into
get_file_reference_by_dataset differs from the number of datasets
computed from the file lists in the submission.
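
A minimal usage sketch of the new warning path, assuming a parsed BioStudies Submission and datasets produced by get_experimental_imaging_dataset; the load_submission helper, the module paths and the accession id are assumptions inferred from the imports visible in the diff, not part of this commit:

# Hedged usage sketch, not part of this commit. `load_submission`, the module
# paths and the accession id are assumptions inferred from the diff below.
from bia_ingest_sm.biostudies import load_submission
from bia_ingest_sm.conversion.experimental_imaging_dataset import (
    get_experimental_imaging_dataset,
)
from bia_ingest_sm.conversion.file_reference import get_file_reference_by_dataset

submission = load_submission("S-BIADXXX")  # placeholder accession id
datasets = get_experimental_imaging_dataset(submission)

# Passing fewer datasets than the submission has file lists should log the
# new "Was this deliberate?" warning while still returning file references
# for the datasets that do have file lists.
file_refs_by_dataset = get_file_reference_by_dataset(submission, datasets[:1])
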
kbab authored Aug 9, 2024
1 parent 5d91c7a commit b9d335a
Showing 15 changed files with 380 additions and 162 deletions.
4 changes: 1 addition & 3 deletions bia-ingest-shared-models/bia_ingest_sm/biostudies.py
@@ -194,7 +194,7 @@ def find_file_lists_in_section(
for subsection in section.subsections:
subsection_type = type(subsection)
if subsection_type == Section:
find_file_lists_in_section(subsection, flists) # type: ignore
find_file_lists_in_section(subsection, flists) # type: ignore
else:
logger.warning(
f"Not processing subsection as type is {subsection_type}, not 'Section'. Contents={subsection}"
@@ -210,8 +210,6 @@ def find_file_lists_in_submission(
return find_file_lists_in_section(submission.section, [])


# KB 14/06/2024 commented out as I need to replace parse_raw_as with
# TypeAdapter for pydantic >=2
def flist_from_flist_fname(
accession_id: str, flist_fname: str, extra_attribute: Union[List[str], str] = None
) -> List[File]:
6 changes: 4 additions & 2 deletions bia-ingest-shared-models/bia_ingest_sm/config.py
@@ -8,18 +8,20 @@
default_output_base = (
f"{Path(os.environ.get('HOME', '')) / '.cache' / 'bia-integrator-data-sm'}"
)


class Settings(BaseSettings):
model_config = SettingsConfigDict(
env_file=f"{Path(__file__).parent.parent / '.env'}",
env_file_encoding="utf-8",
case_sensitive=False,
#extra="forbid",
# extra="forbid",
)

bia_data_dir: str = Field(default_output_base)


#class Settings:
# class Settings:
# def __init__(self):
# self.bia_data_dir = default_output_base

@@ -5,7 +5,7 @@
find_sections_recursive,
dict_to_uuid,
persist,
filter_model_dictionary
filter_model_dictionary,
)
from ..biostudies import (
Submission,
@@ -22,16 +22,20 @@ def get_annotation_method(
) -> List[bia_data_model.AnnotationMethod]:

annotation_method_model_dicts = extract_annotation_method_dicts(submission)
annotation_methods = dicts_to_api_models(annotation_method_model_dicts, bia_data_model.AnnotationMethod)
annotation_methods = dicts_to_api_models(
annotation_method_model_dicts, bia_data_model.AnnotationMethod
)

if persist_artefacts and annotation_methods:
persist(annotation_methods, "annotation_method", submission.accno)

return annotation_methods


def extract_annotation_method_dicts(submission: Submission) -> List[Dict[str, Any]]:
annotation_sections = find_sections_recursive(submission.section, ["Annotations"], [])
annotation_sections = find_sections_recursive(
submission.section, ["Annotations"], []
)

key_mapping = [
("title_id", "Name", ""),
@@ -54,7 +58,9 @@ def extract_annotation_method_dicts(submission: Submission) -> List[Dict[str, Any]]:
model_dict["accession_id"] = submission.accno
model_dict["uuid"] = generate_annotation_method_uuid(model_dict)
model_dict["version"] = 1
model_dict = filter_model_dictionary(model_dict, bia_data_model.AnnotationMethod)
model_dict = filter_model_dictionary(
model_dict, bia_data_model.AnnotationMethod
)

model_dicts.append(model_dict)

@@ -70,6 +76,6 @@ def generate_annotation_method_uuid(protocol_dict: Dict[str, Any]) -> str:
"annotation_criteria",
"annotation_coverage",
"method_type",
"source_dataset"
"source_dataset",
]
return dict_to_uuid(protocol_dict, attributes_to_consider)
@@ -5,7 +5,7 @@
find_sections_recursive,
dict_to_uuid,
persist,
filter_model_dictionary
filter_model_dictionary,
)
from ..biostudies import (
Submission,
@@ -6,9 +6,8 @@
dict_to_uuid,
get_generic_section_as_dict,
persist,
filter_model_dictionary
filter_model_dictionary,
)
from .file_reference import get_file_reference_by_study_component
import bia_ingest_sm.conversion.biosample as biosample_conversion
import bia_ingest_sm.conversion.study as study_conversion
from ..biostudies import (
@@ -33,10 +32,6 @@ def get_experimental_imaging_dataset(
)
analysis_method_dict = get_image_analysis_method(submission)

file_reference_uuids = get_file_reference_by_study_component(
submission, persist_artefacts=persist_artefacts
)

# TODO: Need to persist this (API finally, but initially to disk)
biosamples_in_submission = biosample_conversion.get_biosample(submission)

@@ -45,7 +40,9 @@ def get_experimental_imaging_dataset(
# Use for loop instead of dict comprehension to allow biosamples with
# same title to form list
biosamples_in_submission_uuid = {}
for biosample in biosample_conversion.get_biosample(submission, persist_artefacts=persist_artefacts):
for biosample in biosample_conversion.get_biosample(
submission, persist_artefacts=persist_artefacts
):
if biosample.title_id in biosamples_in_submission_uuid:
biosamples_in_submission_uuid[biosample.title_id].append(biosample.uuid)
else:
@@ -71,7 +68,7 @@ def get_experimental_imaging_dataset(
correlation_method_list = []
biosample_list = []

#TODO: move this to main CLI code to make object generation more independent
# TODO: move this to main CLI code to make object generation more independent
if len(associations) > 0:
# Image Analysis Method
analysis_methods_from_associations = [
@@ -90,7 +87,6 @@ def get_experimental_imaging_dataset(
if biosample in biosamples_in_submission_uuid:
biosample_list.extend(biosamples_in_submission_uuid[biosample])


section_name = attr_dict["Name"]
model_dict = {
"title_id": section_name,
@@ -100,20 +96,24 @@ def get_experimental_imaging_dataset(
"correlation_method": correlation_method_list,
"example_image_uri": [],
"version": 1,
"attribute": {
"associations": associations
}
"attribute": {"associations": associations},
}
model_dict["uuid"] = generate_experimental_imaging_dataset_uuid(model_dict)

model_dict = filter_model_dictionary(model_dict, bia_data_model.ExperimentalImagingDataset)
model_dict = filter_model_dictionary(
model_dict, bia_data_model.ExperimentalImagingDataset
)

experimental_imaging_dataset.append(
bia_data_model.ExperimentalImagingDataset.model_validate(model_dict)
)

if persist_artefacts and experimental_imaging_dataset:
persist(experimental_imaging_dataset, "experimental_imaging_dataset", submission.accno)
persist(
experimental_imaging_dataset,
"experimental_imaging_dataset",
submission.accno,
)

return experimental_imaging_dataset

@@ -135,10 +135,12 @@ def get_image_analysis_method(
)


def generate_experimental_imaging_dataset_uuid(experimental_imaging_dataset_dict: Dict[str, Any]) -> str:
def generate_experimental_imaging_dataset_uuid(
experimental_imaging_dataset_dict: Dict[str, Any]
) -> str:
# TODO: Add 'description' to computation of uuid (Maybe accno?)
attributes_to_consider = [
"title_id",
"title_id",
"submitted_in_study_uuid",
]
return dict_to_uuid(experimental_imaging_dataset_dict, attributes_to_consider)
127 changes: 92 additions & 35 deletions bia-ingest-shared-models/bia_ingest_sm/conversion/file_reference.py
@@ -4,6 +4,7 @@
from .utils import (
dict_to_uuid,
filter_model_dictionary,
find_datasets_with_file_lists,
)
from ..biostudies import (
Submission,
@@ -12,56 +13,112 @@
flist_from_flist_fname,
file_uri,
)
from .. import biostudies # To make reference to biostudies.File explicit
from ..config import settings
from bia_shared_datamodels import bia_data_model

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

def get_file_reference_by_study_component(
submission: Submission, persist_artefacts: bool = False

def get_file_reference_by_dataset(
submission: Submission,
datasets_in_submission: List[
bia_data_model.ExperimentalImagingDataset
| bia_data_model.ImageAnnotationDataset
],
persist_artefacts: bool = False,
) -> Dict[str, List[bia_data_model.FileReference]]:
"""
Return Dict of list of file references in study components.
Return Dict of list of file references in datasets.
"""
file_list_dicts = find_file_lists_in_submission(submission)
fileref_to_study_components = {}

# Get datasets to process
titles_from_datasets_in_submission = {
dataset.title_id for dataset in datasets_in_submission
}

file_list_dicts = find_datasets_with_file_lists(submission)

datasets_to_process = {
ds.title_id: ds
for ds in datasets_in_submission
if ds.title_id in file_list_dicts.keys()
}

if not datasets_to_process:
message = f"""
Intersection of titles from datasets in submission ({titles_from_datasets_in_submission}) and file lists in submission ( {file_list_dicts.keys()} ) was null - exiting
"""
logger.warning(message)
return
else:
n_datasets_with_file_lists = len(file_list_dicts.keys())
n_datasets_in_submission = len(datasets_in_submission)
if n_datasets_with_file_lists != n_datasets_in_submission:
message = f"""Number of datasets with file lists ({n_datasets_with_file_lists}) is not equal to the number of datasets passed as input to this function ({n_datasets_in_submission}). Was this deliberate?"""
logger.warning(message)

if persist_artefacts:
output_dir = Path(settings.bia_data_dir) / "file_references" / submission.accno
if not output_dir.is_dir():
output_dir.mkdir(parents=True)
logger.info(f"Created {output_dir}")

for file_list_dict in file_list_dicts:
study_component_name = file_list_dict["Name"]
if study_component_name not in fileref_to_study_components:
fileref_to_study_components[study_component_name] = []

fname = file_list_dict["File List"]
files_in_fl = flist_from_flist_fname(submission.accno, fname)
for f in files_in_fl:
file_dict = {
"accession_id": submission.accno,
"file_path": str(f.path),
"size_in_bytes": str(f.size),
"version": 1
}
fileref_uuid = dict_to_uuid(
file_dict, ["accession_id", "file_path", "size_in_bytes"]
fileref_to_datasets = {}
for dataset_name, dataset in datasets_to_process.items():
for file_list_dict in file_list_dicts[dataset_name]:
if dataset_name not in fileref_to_datasets:
fileref_to_datasets[dataset_name] = []

fname = file_list_dict["File List"]
files_in_fl = flist_from_flist_fname(submission.accno, fname)

file_references = get_file_reference_for_submission_dataset(
submission.accno, dataset, files_in_fl
)
fileref_to_study_components[study_component_name].append(fileref_uuid)
# TODO - Not storing submission_dataset uuid yet!!!

if persist_artefacts:
file_dict["uuid"] = fileref_uuid
file_dict["uri"] = file_uri(submission.accno, f)
file_dict["submission_dataset"] = fileref_uuid
file_dict["format"] = f.type
file_dict["attribute"] = attributes_to_dict(f.attributes)
file_dict = filter_model_dictionary(file_dict, bia_data_model.FileReference)
file_reference = bia_data_model.FileReference.model_validate(file_dict)
output_path = output_dir / f"{fileref_uuid}.json"
output_path.write_text(file_reference.model_dump_json(indent=2))
logger.info(f"Written {output_path}")

return fileref_to_study_components
for file_reference in file_references:
output_path = output_dir / f"{file_reference.uuid}.json"
output_path.write_text(file_reference.model_dump_json(indent=2))
logger.info(f"Written {output_path}")

fileref_to_datasets[dataset_name].extend(file_references)

return fileref_to_datasets


def get_file_reference_for_submission_dataset(
accession_id: str,
submission_dataset: [
bia_data_model.ExperimentalImagingDataset
| bia_data_model.ImageAnnotationDataset
],
files_in_file_list: List[biostudies.File],
) -> List[bia_data_model.FileReference]:
"""
Return list of file references for particular submission dataset
"""

file_references = []
for f in files_in_file_list:
file_dict = {
"accession_id": accession_id,
"file_path": str(f.path),
"size_in_bytes": str(f.size),
}
fileref_uuid = dict_to_uuid(
file_dict, ["accession_id", "file_path", "size_in_bytes"]
)
file_dict["uuid"] = fileref_uuid
file_dict["uri"] = file_uri(accession_id, f)
file_dict["submission_dataset_uuid"] = submission_dataset.uuid
file_dict["format"] = f.type
file_dict["attribute"] = attributes_to_dict(f.attributes)
file_dict["version"] = 1
file_dict = filter_model_dictionary(file_dict, bia_data_model.FileReference)
file_reference = bia_data_model.FileReference.model_validate(file_dict)
file_references.append(file_reference)

return file_references
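
For reference, a hedged sketch of how the new get_file_reference_for_submission_dataset helper could be exercised; the biostudies.File constructor arguments (path, size, type, attributes) are assumptions based on the attributes the function reads above, and the snippet continues from the earlier usage sketch for its submission and datasets:

# Hedged sketch, not part of this commit.
from bia_ingest_sm import biostudies
from bia_ingest_sm.conversion.file_reference import (
    get_file_reference_for_submission_dataset,
)

# Continuing from the sketch after the commit message: `datasets` is the list
# returned by get_experimental_imaging_dataset; only the dataset uuid is used.
dataset = datasets[0]

# The File constructor arguments are assumptions based on the attributes read
# above (path, size, type, attributes); paths and sizes are illustrative.
files_in_file_list = [
    biostudies.File(path="images/im06.tif", size=1024, type="file", attributes=[]),
    biostudies.File(path="images/im07.tif", size=2048, type="file", attributes=[]),
]

file_references = get_file_reference_for_submission_dataset(
    "S-BIADXXX", dataset, files_in_file_list
)

# Each FileReference should reference the dataset uuid and get a deterministic
# uuid derived from accession_id, file_path and size_in_bytes.
assert all(fr.submission_dataset_uuid == dataset.uuid for fr in file_references)
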
@@ -22,16 +22,20 @@ def get_image_acquisition(
) -> List[bia_data_model.ImageAcquisition]:

image_acquisition_model_dicts = extract_image_acquisition_dicts(submission)
image_acquisitions = dicts_to_api_models(image_acquisition_model_dicts, bia_data_model.ImageAcquisition)
image_acquisitions = dicts_to_api_models(
image_acquisition_model_dicts, bia_data_model.ImageAcquisition
)

if persist_artefacts and image_acquisitions:
persist(image_acquisitions, "specimen_growth_protocol", submission.accno)

return image_acquisitions


def extract_image_acquisition_dicts(submission: Submission) -> List[Dict[str, Any]]:
acquisition_sections = find_sections_recursive(submission.section, ["Image acquisition"], [])
acquisition_sections = find_sections_recursive(
submission.section, ["Image acquisition"], []
)

key_mapping = [
("title_id", "Title", ""),
@@ -53,7 +57,9 @@ def extract_image_acquisition_dicts(submission: Submission) -> List[Dict[str, Any]]:
model_dict["accession_id"] = submission.accno
model_dict["uuid"] = generate_image_acquisition_uuid(model_dict)
model_dict["version"] = 1
model_dict = filter_model_dictionary(model_dict, bia_data_model.ImageAcquisition)
model_dict = filter_model_dictionary(
model_dict, bia_data_model.ImageAcquisition
)
model_dicts.append(model_dict)

return model_dicts
@@ -67,6 +73,6 @@ def generate_image_acquisition_uuid(protocol_dict: Dict[str, Any]) -> str:
"protocol_description",
"imaging_instrument_description",
"imaging_method_name",
"fbbi_id"
"fbbi_id",
]
return dict_to_uuid(protocol_dict, attributes_to_consider)