Skip to content

Commit

Permalink
Save WIP for creating FileReference UUIDs
Browse files Browse the repository at this point in the history
  • Loading branch information
kbab committed Jul 4, 2024
1 parent 34d24b5 commit cf377df
Show file tree
Hide file tree
Showing 6 changed files with 262 additions and 41 deletions.
58 changes: 29 additions & 29 deletions bia-ingest-shared-models/bia_ingest_sm/biostudies.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,35 +212,35 @@ def find_file_lists_in_submission(

# KB 14/06/2024 commented out as I need to replace parse_raw_as with
# TypeAdapter for pydantic >=2
# def flist_from_flist_fname(
# accession_id: str, flist_fname: str, extra_attribute: Union[List[str], str] = None
# ) -> List[File]:
#
# flist_url = FLIST_URI_TEMPLATE.format(
# accession_id=accession_id, flist_fname=flist_fname
# )
#
# r = requests.get(flist_url)
# logger.info(f"Fetching file list from {flist_url}")
# assert r.status_code == 200
#
# # fl = parse_raw_as(List[File], r.content)
# # KB 18/08/2023 - Hack to fix error due to null values in attributes
# # Remove attribute entries with {"value": "null"}
# dict_content = json.loads(r.content)
# dict_filtered_content = filter_filelist_content(dict_content)
# filtered_content = bytes(json.dumps(dict_filtered_content), "utf-8")
# fl = parse_raw_as(List[File], filtered_content)
#
# if extra_attribute:
# if type(extra_attribute) is not list:
# extra_attribute = [
# extra_attribute,
# ]
# for file in fl:
# file.attributes.extend(extra_attribute)
#
# return fl
def flist_from_flist_fname(
    accession_id: str, flist_fname: str, extra_attribute: Union[List[str], str] = None
) -> List[File]:
    """Download a file list and parse its entries into ``File`` objects.

    Args:
        accession_id: Accession substituted into ``FLIST_URI_TEMPLATE``.
        flist_fname: File list name substituted into ``FLIST_URI_TEMPLATE``.
        extra_attribute: Optional attribute, or list of attributes, appended
            to the ``attributes`` of every parsed file.

    Returns:
        The ``File`` objects parsed from the filtered file list content.

    Raises:
        requests.HTTPError: If the response status code is not 200.
    """
    # Imported locally so this block stays self-contained: TypeAdapter is the
    # pydantic >= 2 replacement for parse_raw_as.
    from pydantic import TypeAdapter

    flist_url = FLIST_URI_TEMPLATE.format(
        accession_id=accession_id, flist_fname=flist_fname
    )

    # Log before the request so the message is emitted even if the GET fails.
    logger.info(f"Fetching file list from {flist_url}")
    r = requests.get(flist_url)
    # Explicit check instead of `assert`, which is stripped when Python runs
    # with -O and would let a failed download reach the JSON parser.
    if r.status_code != 200:
        raise requests.HTTPError(
            f"Fetching {flist_url} returned HTTP status {r.status_code}",
            response=r,
        )

    # KB 18/08/2023 - Hack to fix error due to null values in attributes
    # Remove attribute entries with {"value": "null"}
    dict_filtered_content = filter_filelist_content(json.loads(r.content))
    fl = TypeAdapter(List[File]).validate_json(json.dumps(dict_filtered_content))

    if extra_attribute:
        # Accept a single attribute as shorthand for a one-element list.
        if not isinstance(extra_attribute, list):
            extra_attribute = [extra_attribute]
        for file in fl:
            file.attributes.extend(extra_attribute)

    return fl


def file_uri(
Expand Down
37 changes: 36 additions & 1 deletion bia-ingest-shared-models/bia_ingest_sm/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,45 @@
import uuid
from typing import List, Any, Dict, Optional, Tuple, Type
from pydantic import BaseModel
from .biostudies import Submission, attributes_to_dict, Section, Attribute
from .biostudies import (
Submission,
attributes_to_dict,
Section,
Attribute,
find_file_lists_in_submission,
flist_from_flist_fname,
)
from src.bia_models import bia_data_model, semantic_models


def get_file_reference_by_study_component(
    submission: Submission,
) -> Dict[str, List[bia_data_model.FileReference]]:
    """Group file reference UUIDs by the study component that lists them.

    Each file list found in the submission is fetched, and a UUID is derived
    for every file in it from the accession ID, the file path and the file
    size (both converted to strings).

    NOTE(review): the collected values are whatever ``dict_to_uuid`` returns,
    not ``FileReference`` objects as the return annotation suggests -- confirm
    which of the two is intended.
    """
    uuid_keys = ["accession_id", "file_name", "size_in_bytes"]
    uuids_by_component = {}
    for entry in find_file_lists_in_submission(submission):
        # Several file lists may share a study component name; their UUIDs
        # accumulate in the same list.
        component_uuids = uuids_by_component.setdefault(entry["Name"], [])
        listed_files = flist_from_flist_fname(submission.accno, entry["File List"])
        component_uuids.extend(
            dict_to_uuid(
                {
                    "accession_id": submission.accno,
                    "file_name": str(listed.path),
                    "size_in_bytes": str(listed.size),
                },
                uuid_keys,
            )
            for listed in listed_files
        )

    return uuids_by_component


def get_experimental_imaging_dataset(submission: Submission) -> List[bia_data_model.ExperimentalImagingDataset]:
"""Map biostudies.Submission study components to bia_data_model.ExperimentalImagingDataset
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
[
{
"path": "study_component1/im06.png",
"size": 3,
"attributes": [
{
"name": "AnnotationsIn",
"value": "ann06-10.json"
},
{
"name": "metadata1",
"value": "metadata7"
},
{
"name": "metadata2",
"value": "metadata8"
}
],
"type": "file"
},
{
"path": "study_component1/im08.png",
"size": 123,
"attributes": [
{
"name": "AnnotationsIn",
"value": "ann06-10.json"
},
{
"name": "metadata1",
"value": "metadata9"
},
{
"name": "metadata2",
"value": "metadata10"
}
],
"type": "file"
},
{
"path": "study_component1/ann01-05",
"size": 11,
"attributes": [
{
"name": "AnnotationsIn",
"value": "None"
},
{
"name": "metadata1",
"value": "None"
},
{
"name": "metadata2",
"value": "None"
}
],
"type": "directory"
},
{
"path": "study_component1/ann06-10.json",
"size": 12,
"attributes": [
{
"name": "AnnotationsIn",
"value": "None"
},
{
"name": "metadata1",
"value": "None"
},
{
"name": "metadata2",
"value": "None"
}
],
"type": "file"
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
[
{
"path": "study_component2/im06.png",
"size": 3,
"attributes": [
{
"name": "AnnotationsIn",
"value": "ann06-10.json"
},
{
"name": "metadata1",
"value": "metadata7"
},
{
"name": "metadata2",
"value": "metadata8"
}
],
"type": "file"
},
{
"path": "study_component2/im08.png",
"size": 123,
"attributes": [
{
"name": "AnnotationsIn",
"value": "ann06-10.json"
},
{
"name": "metadata1",
"value": "metadata9"
},
{
"name": "metadata2",
"value": "metadata10"
}
],
"type": "file"
},
{
"path": "study_component2/ann01-05",
"size": 11,
"attributes": [
{
"name": "AnnotationsIn",
"value": "None"
},
{
"name": "metadata1",
"value": "None"
},
{
"name": "metadata2",
"value": "None"
}
],
"type": "directory"
}
]
66 changes: 56 additions & 10 deletions bia-ingest-shared-models/test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,16 +303,36 @@ def get_test_experimental_imaging_dataset() -> (
bia_data_model.ExperimentalImagingDataset
):
# Create first study component
file_references = [{
"accession_id": "S-BIADTEST",
"file_name": "study_component1/im06.png",
"size_in_bytes": 3,
},{
"accession_id": "S-BIADTEST",
"file_name": "study_component1/im08.png",
"size_in_bytes": 123,
},{
"accession_id": "S-BIADTEST",
"file_name": "study_component1/ann01-05",
"size_in_bytes": 11,
},{
"accession_id": "S-BIADTEST",
"file_name": "study_component1/ann06-10.json",
"size_in_bytes": 12,
},
]
file_reference_uuids = get_test_file_reference_uuid(file_references)

experimental_imaging_dataset_dict = {
"title_id": "Study Component 1",
"image": [], # This should be a list of Experimentally captured image UUIDs
"file": [], # This should be a list of FileReference UUIDs ...
"file": file_reference_uuids,
"submitted_in_study": get_test_study().uuid,
"specimen_preparation_method": [
get_template_specimen_preparation_protocol().uuid,
#get_template_specimen_preparation_protocol().uuid,
],
"acquisition_method": [
get_test_image_acquisition()[0].uuid,
#get_test_image_acquisition()[0].uuid,
],
# This study component uses both biosamples
"biological_entity": [
Expand All @@ -322,9 +342,9 @@ def get_test_experimental_imaging_dataset() -> (
get_test_image_analysis_method().model_dump(),
],
"correlation_method": [
get_template_image_correlation_method().model_dump(),
#get_template_image_correlation_method().model_dump(),
],
"file_reference_count": 0,
"file_reference_count": 4,
"image_count": 0,
"example_image_uri": [],
}
Expand All @@ -333,16 +353,32 @@ def get_test_experimental_imaging_dataset() -> (
experimental_imaging_dataset1 = bia_data_model.ExperimentalImagingDataset.model_validate(experimental_imaging_dataset_dict)

# Create second study component
file_references = [{
"accession_id": "S-BIADTEST",
"file_name": "study_component2/im06.png",
"size_in_bytes": 3,
},{
"accession_id": "S-BIADTEST",
"file_name": "study_component2/im08.png",
"size_in_bytes": 123,
},{
"accession_id": "S-BIADTEST",
"file_name": "study_component2/ann01-05",
"size_in_bytes": 11,
},
]
file_reference_uuids = get_test_file_reference_uuid(file_references)

experimental_imaging_dataset_dict = {
"title_id": "Study Component 2",
"image": [], # This should be a list of Experimentally captured image UUIDs
"file": [], # This should be a list of FileReference UUIDs ...
"file": file_reference_uuids,
"submitted_in_study": get_test_study().uuid,
"specimen_preparation_method": [
get_template_specimen_preparation_protocol().uuid,
#get_template_specimen_preparation_protocol().uuid,
],
"acquisition_method": [
get_test_image_acquisition()[1].uuid,
#get_test_image_acquisition()[1].uuid,
],
# This study component uses only second biosample
"biological_entity": [
Expand All @@ -352,9 +388,9 @@ def get_test_experimental_imaging_dataset() -> (
get_test_image_analysis_method().model_dump(),
],
"correlation_method": [
get_template_image_correlation_method().model_dump(),
#get_template_image_correlation_method().model_dump(),
],
"file_reference_count": 0,
"file_reference_count": 3,
"image_count": 0,
"example_image_uri": [],
}
Expand Down Expand Up @@ -567,3 +603,13 @@ def get_test_study() -> bia_data_model.Study:
study_dict["uuid"] = study_uuid
study = bia_data_model.Study.model_validate(study_dict)
return study

def get_test_file_reference_uuid(file_references: List[Dict[str, str]]) -> List[str]:
    """Return one ``dict_to_uuid`` result per file reference dict, in order.

    Only the ``accession_id``, ``file_name`` and ``size_in_bytes`` keys are
    passed to ``dict_to_uuid`` as the attributes to consider.
    """
    uuid_keys = ["accession_id", "file_name", "size_in_bytes"]
    uuids = []
    for entry in file_references:
        uuids.append(dict_to_uuid(entry, uuid_keys))
    return uuids
5 changes: 4 additions & 1 deletion bia-shared-datamodels/src/bia_models/semantic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,8 +228,11 @@ class FileReference(BaseModel):
"""
Information about a file, provided in file list.
"""


# TODO: Clarify if this should be file_name or file_path
file_name: str = Field(description="""The name of the file.""")
# TODO: Clarify if this should be biostudies 'type' or derived from
# file extension
format: str = Field(description="""File format or type.""")
size_in_bytes: int = Field(description="""Disc size in bytes.""")
uri: str = Field(description="""URI from which the file can be accessed.""")
Expand Down

0 comments on commit cf377df

Please sign in to comment.