def flist_from_flist_fname(
    accession_id: str,
    flist_fname: str,
    extra_attribute: Union[List[str], str, None] = None,
) -> List[File]:
    """Download a file list for a submission and parse it into ``File`` objects.

    The URL is built from ``FLIST_URI_TEMPLATE`` using ``accession_id`` and
    ``flist_fname``. The downloaded JSON is passed through
    ``filter_filelist_content`` before validation.

    Args:
        accession_id: Accession ID substituted into the URL template.
        flist_fname: File list name substituted into the URL template.
        extra_attribute: Optional attribute, or list of attributes, appended
            to the ``attributes`` of every parsed file. A single value is
            wrapped in a list first.

    Returns:
        The parsed list of ``File`` objects.

    Raises:
        requests.HTTPError: If the response status code is not 200. This
            replaces a bare ``assert``, which is skipped under ``python -O``.
    """
    # parse_raw_as is not available in pydantic >= 2; TypeAdapter is its
    # replacement. Imported here so the function does not rely on the
    # module-level import list.
    from pydantic import TypeAdapter

    flist_url = FLIST_URI_TEMPLATE.format(
        accession_id=accession_id, flist_fname=flist_fname
    )

    # Log before the request so the message is visible even if the call hangs
    # or raises.
    logger.info(f"Fetching file list from {flist_url}")
    r = requests.get(flist_url)
    if r.status_code != 200:
        raise requests.HTTPError(
            f"Fetching {flist_url} returned status code {r.status_code}",
            response=r,
        )

    # KB 18/08/2023 - Hack to fix error due to null values in attributes
    # Remove attribute entries with {"value": "null"}
    dict_filtered_content = filter_filelist_content(json.loads(r.content))
    # Validate from JSON text, as the original parse_raw_as call did.
    fl = TypeAdapter(List[File]).validate_json(json.dumps(dict_filtered_content))

    if extra_attribute:
        if not isinstance(extra_attribute, list):
            extra_attribute = [extra_attribute]
        for file in fl:
            file.attributes.extend(extra_attribute)

    return fl
def get_file_reference_by_study_component(
    submission: Submission,
) -> Dict[str, List[str]]:
    """Return the file reference UUIDs of each study component.

    Every file list found by ``find_file_lists_in_submission`` is fetched
    with ``flist_from_flist_fname``. For each file in it a UUID is derived
    with ``dict_to_uuid`` from the submission accession number, the file
    path and the file size.

    Args:
        submission: Submission whose file lists are read.

    Returns:
        A dict mapping each file list's ``"Name"`` value (the study component
        name) to the list of UUIDs of its files. UUIDs from several file
        lists sharing the same name are accumulated in one list. The values
        are UUIDs, not ``FileReference`` objects.
    """
    uuid_attributes = ["accession_id", "file_name", "size_in_bytes"]
    fileref_to_study_components: Dict[str, List[str]] = {}

    for file_list_dict in find_file_lists_in_submission(submission):
        study_component_name = file_list_dict["Name"]
        # Create the entry before fetching, so a study component with an
        # empty file list still appears in the result.
        fileref_uuids = fileref_to_study_components.setdefault(
            study_component_name, []
        )

        fname = file_list_dict["File List"]
        for f in flist_from_flist_fname(submission.accno, fname):
            file_dict = {
                "accession_id": submission.accno,
                "file_name": str(f.path),
                "size_in_bytes": str(f.size),
            }
            fileref_uuids.append(dict_to_uuid(file_dict, uuid_attributes))

    return fileref_to_study_components
}, + { + "name": "metadata2", + "value": "None" + } + ], + "type": "directory" + }, + { + "path": "study_component1/ann06-10.json", + "size": 12, + "attributes": [ + { + "name": "AnnotationsIn", + "value": "None" + }, + { + "name": "metadata1", + "value": "None" + }, + { + "name": "metadata2", + "value": "None" + } + ], + "type": "file" + } +] diff --git a/bia-ingest-shared-models/test/data/file_list_study_component_2.json b/bia-ingest-shared-models/test/data/file_list_study_component_2.json new file mode 100644 index 00000000..cd9a328d --- /dev/null +++ b/bia-ingest-shared-models/test/data/file_list_study_component_2.json @@ -0,0 +1,59 @@ +[ + { + "path": "study_component2/im06.png", + "size": 3, + "attributes": [ + { + "name": "AnnotationsIn", + "value": "ann06-10.json" + }, + { + "name": "metadata1", + "value": "metadata7" + }, + { + "name": "metadata2", + "value": "metadata8" + } + ], + "type": "file" + }, + { + "path": "study_component2/im08.png", + "size": 123, + "attributes": [ + { + "name": "AnnotationsIn", + "value": "ann06-10.json" + }, + { + "name": "metadata1", + "value": "metadata9" + }, + { + "name": "metadata2", + "value": "metadata10" + } + ], + "type": "file" + }, + { + "path": "study_component2/ann01-05", + "size": 11, + "attributes": [ + { + "name": "AnnotationsIn", + "value": "None" + }, + { + "name": "metadata1", + "value": "None" + }, + { + "name": "metadata2", + "value": "None" + } + ], + "type": "directory" + } +] diff --git a/bia-ingest-shared-models/test/utils.py b/bia-ingest-shared-models/test/utils.py index 3436974c..5e292da1 100644 --- a/bia-ingest-shared-models/test/utils.py +++ b/bia-ingest-shared-models/test/utils.py @@ -303,16 +303,36 @@ def get_test_experimental_imaging_dataset() -> ( bia_data_model.ExperimentalImagingDataset ): # Create first study component + file_references = [{ + "accession_id": "S-BIADTEST", + "file_name": "study_component1/im06.png", + "size_in_bytes": 3, + },{ + "accession_id": "S-BIADTEST", + "file_name": 
"study_component1/im08.png", + "size_in_bytes": 123, + },{ + "accession_id": "S-BIADTEST", + "file_name": "study_component1/ann01-05", + "size_in_bytes": 11, + },{ + "accession_id": "S-BIADTEST", + "file_name": "study_component1/ann06-10.json", + "size_in_bytes": 12, + }, + ] + file_reference_uuids = get_test_file_reference_uuid(file_references) + experimental_imaging_dataset_dict = { "title_id": "Study Component 1", "image": [], # This should be a list of Experimentally captured image UUIDs - "file": [], # This should be a list of FileReference UUIDs ... + "file": file_reference_uuids, "submitted_in_study": get_test_study().uuid, "specimen_preparation_method": [ - get_template_specimen_preparation_protocol().uuid, + #get_template_specimen_preparation_protocol().uuid, ], "acquisition_method": [ - get_test_image_acquisition()[0].uuid, + #get_test_image_acquisition()[0].uuid, ], # This study component uses both biosamples "biological_entity": [ @@ -322,9 +342,9 @@ def get_test_experimental_imaging_dataset() -> ( get_test_image_analysis_method().model_dump(), ], "correlation_method": [ - get_template_image_correlation_method().model_dump(), + #get_template_image_correlation_method().model_dump(), ], - "file_reference_count": 0, + "file_reference_count": 4, "image_count": 0, "example_image_uri": [], } @@ -333,16 +353,32 @@ def get_test_experimental_imaging_dataset() -> ( experimental_imaging_dataset1 = bia_data_model.ExperimentalImagingDataset.model_validate(experimental_imaging_dataset_dict) # Create second study component + file_references = [{ + "accession_id": "S-BIADTEST", + "file_name": "study_component2/im06.png", + "size_in_bytes": 3, + },{ + "accession_id": "S-BIADTEST", + "file_name": "study_component2/im08.png", + "size_in_bytes": 123, + },{ + "accession_id": "S-BIADTEST", + "file_name": "study_component2/ann01-05", + "size_in_bytes": 11, + }, + ] + file_reference_uuids = get_test_file_reference_uuid(file_references) + experimental_imaging_dataset_dict = { 
"title_id": "Study Component 2", "image": [], # This should be a list of Experimentally captured image UUIDs - "file": [], # This should be a list of FileReference UUIDs ... + "file": file_reference_uuids, "submitted_in_study": get_test_study().uuid, "specimen_preparation_method": [ - get_template_specimen_preparation_protocol().uuid, + #get_template_specimen_preparation_protocol().uuid, ], "acquisition_method": [ - get_test_image_acquisition()[1].uuid, + #get_test_image_acquisition()[1].uuid, ], # This study component uses only second biosample "biological_entity": [ @@ -352,9 +388,9 @@ def get_test_experimental_imaging_dataset() -> ( get_test_image_analysis_method().model_dump(), ], "correlation_method": [ - get_template_image_correlation_method().model_dump(), + #get_template_image_correlation_method().model_dump(), ], - "file_reference_count": 0, + "file_reference_count": 3, "image_count": 0, "example_image_uri": [], } @@ -567,3 +603,13 @@ def get_test_study() -> bia_data_model.Study: study_dict["uuid"] = study_uuid study = bia_data_model.Study.model_validate(study_dict) return study + +def get_test_file_reference_uuid(file_references: List[Dict[str, str]]) -> List[str]: + attributes_to_consider = [ + "accession_id", + "file_name", + "size_in_bytes", + ] + return [ + dict_to_uuid(file_reference, attributes_to_consider) for file_reference in file_references + ] diff --git a/bia-shared-datamodels/src/bia_models/semantic_models.py b/bia-shared-datamodels/src/bia_models/semantic_models.py index b7432ae0..c59de3da 100644 --- a/bia-shared-datamodels/src/bia_models/semantic_models.py +++ b/bia-shared-datamodels/src/bia_models/semantic_models.py @@ -228,8 +228,11 @@ class FileReference(BaseModel): """ Information about a file, provided in file list. 
""" - + + # TODO: Clarify if this should be file_name or file_path file_name: str = Field(description="""The name of the file.""") + # TODO: Clarify if this should be biostudies 'type' or derived from + # file extension format: str = Field(description="""File format or type.""") size_in_bytes: int = Field(description="""Disc size in bytes.""") uri: str = Field(description="""URI from which the file can be accessed.""")