Create FileReferences for 'annotation' datasets before 'normal' ones (#300)

* Create FileReferences for 'annotation' datasets before 'normal' ones

Create file references for 'annotation' datasets before 'normal'
datasets to address the issue that a file reference can only point to
one dataset. Creating file references for 'normal' datasets last
ensures that if an annotation dataset and a normal dataset link to the
same file, the file reference points to the 'normal' dataset.

* Add tests to verify order of creation of datasets

* Refactor as suggested in PR review
kbab authored Feb 7, 2025
1 parent fec017b commit 3bdfe5a
Showing 6 changed files with 295 additions and 34 deletions.
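
Before the diffs, some context on why processing order matters: a file reference can only point to a single dataset, so when two datasets list the same file, whichever dataset has its file list processed last ends up owning the file reference. Below is a minimal sketch of that overwrite behaviour (plain Python, not bia-ingest code; make_file_reference_uuid, the accession ID and the dataset UUID strings are hypothetical placeholders for the ingest's deterministic UUID generation):

from uuid import NAMESPACE_URL, uuid5

file_references: dict[str, dict] = {}


def make_file_reference_uuid(accession_id: str, file_path: str) -> str:
    # Hypothetical: derive a deterministic UUID from accession + file path,
    # independent of which dataset lists the file.
    return str(uuid5(NAMESPACE_URL, f"{accession_id}/{file_path}"))


def create_file_references(accession_id: str, dataset_uuid: str, file_paths: list[str]) -> None:
    for file_path in file_paths:
        key = make_file_reference_uuid(accession_id, file_path)
        # A duplicate key overwrites the earlier entry, re-pointing the
        # file reference's submission_dataset_uuid at the later dataset.
        file_references[key] = {
            "file_path": file_path,
            "submission_dataset_uuid": dataset_uuid,
        }


# Annotation datasets are processed first, study component ('normal') datasets
# last, so a shared file ends up pointing at the 'normal' dataset.
create_file_references("S-BIAD0000", "annotation-dataset-uuid", ["images/im01.tif"])
create_file_references("S-BIAD0000", "normal-dataset-uuid", ["images/im01.tif"])
assert (
    file_references[make_file_reference_uuid("S-BIAD0000", "images/im01.tif")]["submission_dataset_uuid"]
    == "normal-dataset-uuid"
)
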
34 changes: 26 additions & 8 deletions bia-ingest/bia_ingest/biostudies/process_submission_v4.py
@@ -57,19 +57,37 @@ def process_submission_v4(submission, result_summary, process_files, persister):

     datasets = get_dataset(
         submission,
         study_uuid,
         association_object_dict,
         result_summary,
         persister=persister,
     )

     if process_files:
-        get_file_reference_by_dataset(
-            submission,
-            study_uuid,
-            datasets,
-            result_summary,
-            persister=persister,
-        )
+        # Currently (03/02/2025) Image conversion does not create images for annotation datasets
+        # In cases where a user has provided confusing connections between their files and datasets
+        # i.e. through multiple datasets having file lists which reference the same file (often re-using the same file list)
+        # we would prefer the created file reference to link to non-annotation dataset, since we can then use them for image conversion.
+        # By processing file lists from Study Component sections last, we overwrite the submission_dataset_uuid of any duplicate created file reference.
+        # We do not have a particular preference order between datasets that were all created from Study Components.
+        for datasets_key, datasets_value in datasets.items():
+            if datasets_key != "from_study_component" and datasets.get(datasets_key):
+                get_file_reference_by_dataset(
+                    submission,
+                    study_uuid,
+                    datasets[datasets_key],
+                    result_summary,
+                    persister=persister,
+                )
+
+        if datasets.get("from_study_component"):
+            get_file_reference_by_dataset(
+                submission,
+                study_uuid,
+                datasets["from_study_component"],
+                result_summary,
+                persister=persister,
+            )

     else:
         logger.info("Skipping file reference creation.")
33 changes: 17 additions & 16 deletions bia-ingest/bia_ingest/biostudies/v4/dataset.py
@@ -28,24 +28,29 @@ def get_dataset(
     bsst_title_to_bia_object_map: dict,
     result_summary: dict,
     persister: Optional[PersistenceStrategy] = None,
-) -> List[bia_data_model.Dataset]:
-
-    dataset = []
-    dataset += get_dataset_dict_from_study_component(
+) -> dict:
+    dataset = {
+        "from_study_component": [],
+        "from_annotation": [],
+        "from_other": [],
+    }
+    dataset["from_study_component"] = get_dataset_dict_from_study_component(
         submission, study_uuid, bsst_title_to_bia_object_map
     )
-    dataset += get_dataset_dict_from_annotation(
+    dataset["from_annotation"] = get_dataset_dict_from_annotation(
         submission, study_uuid, bsst_title_to_bia_object_map
     )

-    datasets = dicts_to_api_models(
-        dataset,
-        bia_data_model.Dataset,
-        result_summary[submission.accno],
-    )
+    datasets = {}
+    for dataset_type, dataset_value in dataset.items():
+        datasets[dataset_type] = dicts_to_api_models(
+            dataset_value,
+            bia_data_model.Dataset,
+            result_summary[submission.accno],
+        )

-    if persister and datasets:
-        persister.persist(datasets)
+        if persister and datasets.get(dataset_type):
+            persister.persist(datasets[dataset_type])

     return datasets

@@ -55,7 +60,6 @@ def get_dataset_dict_from_study_component(
     study_uuid: UUID,
     bsst_title_to_bia_object_map: dict,
 ) -> list[dict]:
-
     study_components = find_sections_recursive(submission.section, ["Study Component"])

     model_dicts = []
@@ -146,12 +150,10 @@ def get_uuid_attribute_from_associations(
     associations: List[Association],
     object_map: dict[str, dict[str, bia_data_model.DocumentMixin]],
 ):
-
     bio_sample_uuids = []
     specimen_prepartion_protocol_uuids = []
     image_acquisition_uuids = []
     for association in associations:
-
         if association.biosample:
             biosample_with_gp_key = association.biosample + "." + association.specimen
             if biosample_with_gp_key in object_map["bio_sample"]:
@@ -267,7 +269,6 @@ def store_annotation_method_in_attribute(
 def get_image_analysis_method_from_associations(
     associations: List[Association], object_map: dict[str, dict]
 ):
-
     image_analysis = []
     for association_key, method_object in object_map["image_analysis_method"].items():
         add_object = False
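
The key change above is that get_dataset now returns a dict of dataset lists keyed by where they came from, rather than a single flat list. A rough sketch of the new shape and of how a caller can flatten it (the keys come from the diff; the Dataset class and titles here are invented stand-ins for bia_data_model.Dataset):

from dataclasses import dataclass


@dataclass
class Dataset:
    title: str


datasets_by_origin: dict[str, list[Dataset]] = {
    "from_study_component": [Dataset("Study Component 1"), Dataset("Study Component 2")],
    "from_annotation": [Dataset("Segmentation masks")],
    "from_other": [],
}

# process_submission_v4 can now handle each group separately (study component
# datasets last), while callers that still want one list, such as the updated
# test_create_models_dataset below, simply flatten the dict values:
all_datasets: list[Dataset] = []
for dataset_group in datasets_by_origin.values():
    all_datasets.extend(dataset_group)
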
3 changes: 1 addition & 2 deletions bia-ingest/test/test_bia_ingest_cli.py
@@ -20,7 +20,7 @@


 @pytest.fixture
-def expected_objects() -> tuple[dict, dict]:
+def expected_objects() -> tuple[dict, int]:
     datasets = mock_dataset.get_dataset()
     expected_objects_dict = {
         "study": [mock_study.get_study()],
@@ -117,7 +117,6 @@ def test_cli_find_test_study(
     mock_search_result,
     expected_objects,
 ):
-
     outfile = tmp_path.absolute() / "find_output"

     result = runner.invoke(
16 changes: 8 additions & 8 deletions bia-ingest/test/test_biostudies_conversion.py
@@ -8,7 +8,6 @@
     mock_image_acquisition_protocol,
     mock_annotation_method,
     mock_image_analysis_method,
-    mock_image_correlation_method,
     mock_object_constants,
 )
 from bia_ingest.biostudies.common import study
@@ -27,9 +26,7 @@ def test_create_models_specimen_imaging_preparation_protocol(
     test_submission,
     ingestion_result_summary,
 ):
-    expected = (
-        mock_specimen_imaging_preparation_protocol.get_specimen_imaging_preparation_protocol_as_map()
-    )
+    expected = mock_specimen_imaging_preparation_protocol.get_specimen_imaging_preparation_protocol_as_map()
     created = specimen_imaging_preparation_protocol.get_specimen_imaging_preparation_protocol_as_map(
         test_submission, mock_object_constants.study_uuid, ingestion_result_summary
     )
@@ -120,27 +117,30 @@ def test_create_models_dataset(
     ingestion_result_summary,
 ):
     expected = mock_dataset.get_dataset()
-    created = dataset.get_dataset(
+    created_dict = dataset.get_dataset(
         test_submission,
         mock_object_constants.study_uuid,
         association_dict,
         ingestion_result_summary,
     )
+    created = []
+    for created_dataset in created_dict.values():
+        created.extend(created_dataset)

     assert expected == created


 def test_create_models_dataset_biostudies_default(
     test_submission_biostudies_default_direct_files,
     ingestion_result_summary_biostudies_default,
 ):
     expected = mock_dataset.get_dataset_biostudies_default()
     created = default_dataset.get_dataset_overview(
         test_submission_biostudies_default_direct_files,
         mock_object_constants.study_uuid_biostudies_default,
         ingestion_result_summary_biostudies_default,
     )

     assert expected == created
181 changes: 181 additions & 0 deletions bia-ingest/test/test_order_of_processing_datasets.py
@@ -0,0 +1,181 @@
from bia_ingest.biostudies.process_submission_v4 import process_submission_v4
from bia_ingest.biostudies import api
from bia_shared_datamodels import bia_data_model

import pytest
from bia_test_data.mock_objects import (
    mock_dataset,
    mock_file_reference,
)

from bia_ingest.persistence_strategy import (
    persistence_strategy_factory,
    PersistenceStrategy,
)


def _modify_annotation_file_list(
    submission: api.Submission, file_list_path: str
) -> api.Submission:
    """Modify the path of the file list in an 'annotation' study component"""

    # Assumes there is only one annotation section and uses the first one found
    copy_of_submission = api.Submission.model_validate(submission.model_dump())
    annotation_section = next(
        s for s in copy_of_submission.section.subsections if s.type == "Annotations"
    )
    attribute = next(a for a in annotation_section.attributes if a.name == "File List")
    attribute.value = file_list_path
    return copy_of_submission


@pytest.fixture(scope="function")
def persister(test_submission, tmp_path) -> PersistenceStrategy:
    persister = persistence_strategy_factory(
        persistence_mode="disk",
        output_dir_base=str(tmp_path),
        accession_id=test_submission.accno,
    )
    return persister


@pytest.fixture
def submission_where_same_file_list_is_used_in_annotation_and_a_study_component(
    test_submission,
) -> api.Submission:
    return _modify_annotation_file_list(
        test_submission, "file_list_study_component_1.json"
    )


@pytest.fixture
def submission_where_same_file_is_referred_to_in_annotation_fl_and_a_study_component_fl(
    test_submission,
) -> api.Submission:
    # file_list_annotations_and_images has one image from study component 1 and one from study component 2
    return _modify_annotation_file_list(
        test_submission, "file_list_annotations_and_images.json"
    )


@pytest.fixture
def only_study_component_file_references(
    test_submission,
    mock_request_get,
) -> dict:
    datasets = mock_dataset.get_dataset()

    ds_study_component_1 = datasets[0]
    ds_study_component_2 = datasets[1]

    file_references_sc_1 = mock_file_reference.get_file_reference(
        {
            ds_study_component_1.uuid: "biad_v4/file_list_study_component_1.json",
        }
    )
    file_references_sc_2 = mock_file_reference.get_file_reference(
        {
            ds_study_component_2.uuid: "biad_v4/file_list_study_component_2.json",
        },
    )

    file_references = {f.uuid: f for f in file_references_sc_1}
    file_references.update({f.uuid: f for f in file_references_sc_2})

    return file_references


@pytest.fixture
def expected_file_references_where_same_file_is_referred_to_in_annotation_fl_and_a_study_component_fl(
    test_submission,
    mock_request_get,
    only_study_component_file_references,
) -> dict:
    datasets = mock_dataset.get_dataset()

    ds_annotation = datasets[2]

    file_references_annotation = mock_file_reference.get_file_reference(
        {
            ds_annotation.uuid: "biad_v4/file_list_annotations_1.json",
        },
    )

    file_references = only_study_component_file_references.copy()
    file_references.update({f.uuid: f for f in file_references_annotation})

    return file_references


def test_order_of_processing_datasets_where_same_file_list_is_used_in_annotation_and_a_study_component(
    submission_where_same_file_list_is_used_in_annotation_and_a_study_component,
    only_study_component_file_references,
    ingestion_result_summary,
    persister,
    tmp_path,
):
    """
    Test that when an annotation study component and a standard study component
    have the same file list, resulting file references point to the dataset for
    the standard study component.
    """

    submission = (
        submission_where_same_file_list_is_used_in_annotation_and_a_study_component
    )
    expected_file_references = only_study_component_file_references
    process_submission_v4(
        submission=submission,
        result_summary=ingestion_result_summary,
        process_files=True,
        persister=persister,
    )

    # Check expected number of file references written
    file_reference_base_path = tmp_path / "file_reference" / submission.accno
    created_file_references = [
        bia_data_model.FileReference.model_validate_json(f.read_text())
        for f in file_reference_base_path.glob("*.json")
    ]

    # Check the file references belong to the expected datasets
    assert len(created_file_references) == len(expected_file_references)
    for created_file_reference in created_file_references:
        expected_file_reference = expected_file_references[created_file_reference.uuid]
        assert expected_file_reference == created_file_reference


def test_order_of_processing_datasets_where_same_file_is_referred_to_in_annotation_fl_and_a_study_component_fl(
    submission_where_same_file_is_referred_to_in_annotation_fl_and_a_study_component_fl,
    expected_file_references_where_same_file_is_referred_to_in_annotation_fl_and_a_study_component_fl,
    ingestion_result_summary,
    persister,
    tmp_path,
):
    """
    Test that when the same file is in an annotation study component
    and a standard study component, the file reference created points
    to the dataset for the standard study component.
    """

    submission = submission_where_same_file_list_is_used_in_annotation_and_a_study_component if False else submission_where_same_file_is_referred_to_in_annotation_fl_and_a_study_component_fl
    expected_file_references = expected_file_references_where_same_file_is_referred_to_in_annotation_fl_and_a_study_component_fl
    process_submission_v4(
        submission=submission,
        result_summary=ingestion_result_summary,
        process_files=True,
        persister=persister,
    )

    # Check expected number of file references written
    file_reference_base_path = tmp_path / "file_reference" / submission.accno
    created_file_references = [
        bia_data_model.FileReference.model_validate_json(f.read_text())
        for f in file_reference_base_path.glob("*.json")
    ]

    # Check the file references belong to the expected datasets
    assert len(created_file_references) == len(expected_file_references)
    for created_file_reference in created_file_references:
        expected_file_reference = expected_file_references[created_file_reference.uuid]
        assert expected_file_reference == created_file_reference
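
A brief usage note: the two tests above rely on the test_submission, ingestion_result_summary and mock_request_get fixtures, which are presumably provided by the test suite's shared conftest. Assuming the project's test dependencies are installed and the commands are run from the repository root, the new module alone should run with: pytest bia-ingest/test/test_order_of_processing_datasets.py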