Create FileReferences for 'annotation' datasets before 'normal' ones (#300)

* Create FileReferences for 'annotation' datasets before 'normal' ones

Create file references for 'annotation' datasets before 'normal'
datasets to address the issue that a file reference can only point to
one dataset. Creating file references for 'normal' datasets last
ensures that if an annotation dataset and a normal dataset link to the
same file, the file reference points to the 'normal' dataset.

* Add tests to verify order of creation of datasets

* Refactor as suggested in PR review
kbab authored Feb 7, 2025
1 parent fec017b commit 3bdfe5a
Showing 6 changed files with 295 additions and 34 deletions.
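
Before the diffs, some context on why processing order matters: a file reference can only point to a single dataset, so when two datasets list the same file, whichever dataset has its file list processed last ends up owning the file reference. Below is a minimal sketch of that overwrite behaviour (plain Python, not bia-ingest code; make_file_reference_uuid, the accession ID and the dataset UUID strings are hypothetical placeholders for the ingest's deterministic UUID generation):

from uuid import NAMESPACE_URL, uuid5

file_references: dict[str, dict] = {}


def make_file_reference_uuid(accession_id: str, file_path: str) -> str:
    # Hypothetical: derive a deterministic UUID from accession + file path,
    # independent of which dataset lists the file.
    return str(uuid5(NAMESPACE_URL, f"{accession_id}/{file_path}"))


def create_file_references(accession_id: str, dataset_uuid: str, file_paths: list[str]) -> None:
    for file_path in file_paths:
        key = make_file_reference_uuid(accession_id, file_path)
        # A duplicate key overwrites the earlier entry, re-pointing the
        # file reference's submission_dataset_uuid at the later dataset.
        file_references[key] = {
            "file_path": file_path,
            "submission_dataset_uuid": dataset_uuid,
        }


# Annotation datasets are processed first, study component ('normal') datasets
# last, so a shared file ends up pointing at the 'normal' dataset.
create_file_references("S-BIAD0000", "annotation-dataset-uuid", ["images/im01.tif"])
create_file_references("S-BIAD0000", "normal-dataset-uuid", ["images/im01.tif"])
assert (
    file_references[make_file_reference_uuid("S-BIAD0000", "images/im01.tif")]["submission_dataset_uuid"]
    == "normal-dataset-uuid"
)
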
34 changes: 26 additions & 8 deletions bia-ingest/bia_ingest/biostudies/process_submission_v4.py
@@ -57,19 +57,37 @@ def process_submission_v4(submission, result_summary, process_files, persister):

     datasets = get_dataset(
         submission,
         study_uuid,
         association_object_dict,
         result_summary,
         persister=persister,
     )

     if process_files:
-        get_file_reference_by_dataset(
-            submission,
-            study_uuid,
-            datasets,
-            result_summary,
-            persister=persister,
-        )
+        # Currently (03/02/2025) Image conversion does not create images for annotation datasets
+        # In cases where a user has provided confusing connections between their files and datasets
+        # i.e. through multiple datasets having file lists which reference the same file (often re-using the same file list)
+        # we would prefer the created file reference to link to non-annotation dataset, since we can then use them for image conversion.
+        # By processing file lists from Study Component sections last, we overwrite the submission_dataset_uuid of any duplicate created file reference.
+        # We do not have a particular preference order between datasets that were all created from Study Components.
+        for datasets_key, datasets_value in datasets.items():
+            if datasets_key != "from_study_component" and datasets.get(datasets_key):
+                get_file_reference_by_dataset(
+                    submission,
+                    study_uuid,
+                    datasets[datasets_key],
+                    result_summary,
+                    persister=persister,
+                )
+
+        if datasets.get("from_study_component"):
+            get_file_reference_by_dataset(
+                submission,
+                study_uuid,
+                datasets["from_study_component"],
+                result_summary,
+                persister=persister,
+            )

     else:
         logger.info("Skipping file reference creation.")
33 changes: 17 additions & 16 deletions bia-ingest/bia_ingest/biostudies/v4/dataset.py
@@ -28,24 +28,29 @@ def get_dataset(
     bsst_title_to_bia_object_map: dict,
     result_summary: dict,
     persister: Optional[PersistenceStrategy] = None,
-) -> List[bia_data_model.Dataset]:
-
-    dataset = []
-    dataset += get_dataset_dict_from_study_component(
+) -> dict:
+    dataset = {
+        "from_study_component": [],
+        "from_annotation": [],
+        "from_other": [],
+    }
+    dataset["from_study_component"] = get_dataset_dict_from_study_component(
         submission, study_uuid, bsst_title_to_bia_object_map
     )
-    dataset += get_dataset_dict_from_annotation(
+    dataset["from_annotation"] = get_dataset_dict_from_annotation(
         submission, study_uuid, bsst_title_to_bia_object_map
     )

-    datasets = dicts_to_api_models(
-        dataset,
-        bia_data_model.Dataset,
-        result_summary[submission.accno],
-    )
+    datasets = {}
+    for dataset_type, dataset_value in dataset.items():
+        datasets[dataset_type] = dicts_to_api_models(
+            dataset_value,
+            bia_data_model.Dataset,
+            result_summary[submission.accno],
+        )

-    if persister and datasets:
-        persister.persist(datasets)
+        if persister and datasets.get(dataset_type):
+            persister.persist(datasets[dataset_type])

     return datasets

@@ -55,7 +60,6 @@ def get_dataset_dict_from_study_component(
     study_uuid: UUID,
     bsst_title_to_bia_object_map: dict,
 ) -> list[dict]:
-
     study_components = find_sections_recursive(submission.section, ["Study Component"])

     model_dicts = []
@@ -146,12 +150,10 @@ def get_uuid_attribute_from_associations(
     associations: List[Association],
     object_map: dict[str, dict[str, bia_data_model.DocumentMixin]],
 ):
-
     bio_sample_uuids = []
     specimen_prepartion_protocol_uuids = []
     image_acquisition_uuids = []
     for association in associations:
-
         if association.biosample:
             biosample_with_gp_key = association.biosample + "." + association.specimen
             if biosample_with_gp_key in object_map["bio_sample"]:
@@ -267,7 +269,6 @@ def store_annotation_method_in_attribute(
 def get_image_analysis_method_from_associations(
     associations: List[Association], object_map: dict[str, dict]
 ):
-
     image_analysis = []
     for association_key, method_object in object_map["image_analysis_method"].items():
         add_object = False
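
The key change above is that get_dataset now returns a dict of dataset lists keyed by where they came from, rather than a single flat list. A rough sketch of the new shape and of how a caller can flatten it (the keys come from the diff; the Dataset class and titles here are invented stand-ins for bia_data_model.Dataset):

from dataclasses import dataclass


@dataclass
class Dataset:
    title: str


datasets_by_origin: dict[str, list[Dataset]] = {
    "from_study_component": [Dataset("Study Component 1"), Dataset("Study Component 2")],
    "from_annotation": [Dataset("Segmentation masks")],
    "from_other": [],
}

# process_submission_v4 can now handle each group separately (study component
# datasets last), while callers that still want one list, such as the updated
# test_create_models_dataset below, simply flatten the dict values:
all_datasets: list[Dataset] = []
for dataset_group in datasets_by_origin.values():
    all_datasets.extend(dataset_group)
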
3 changes: 1 addition & 2 deletions bia-ingest/test/test_bia_ingest_cli.py
@@ -20,7 +20,7 @@


 @pytest.fixture
-def expected_objects() -> tuple[dict, dict]:
+def expected_objects() -> tuple[dict, int]:
     datasets = mock_dataset.get_dataset()
     expected_objects_dict = {
         "study": [mock_study.get_study()],
@@ -117,7 +117,6 @@ def test_cli_find_test_study(
     mock_search_result,
     expected_objects,
 ):
-
     outfile = tmp_path.absolute() / "find_output"

     result = runner.invoke(
16 changes: 8 additions & 8 deletions bia-ingest/test/test_biostudies_conversion.py
@@ -8,7 +8,6 @@
     mock_image_acquisition_protocol,
     mock_annotation_method,
     mock_image_analysis_method,
-    mock_image_correlation_method,
     mock_object_constants,
 )
 from bia_ingest.biostudies.common import study
@@ -27,9 +26,7 @@ def test_create_models_specimen_imaging_preparation_protocol(
     test_submission,
     ingestion_result_summary,
 ):
-    expected = (
-        mock_specimen_imaging_preparation_protocol.get_specimen_imaging_preparation_protocol_as_map()
-    )
+    expected = mock_specimen_imaging_preparation_protocol.get_specimen_imaging_preparation_protocol_as_map()
     created = specimen_imaging_preparation_protocol.get_specimen_imaging_preparation_protocol_as_map(
         test_submission, mock_object_constants.study_uuid, ingestion_result_summary
     )
@@ -120,27 +117,30 @@ def test_create_models_dataset(
     ingestion_result_summary,
 ):
     expected = mock_dataset.get_dataset()
-    created = dataset.get_dataset(
+    created_dict = dataset.get_dataset(
         test_submission,
         mock_object_constants.study_uuid,
         association_dict,
         ingestion_result_summary,
     )
+    created = []
+    for created_dataset in created_dict.values():
+        created.extend(created_dataset)

     assert expected == created


 def test_create_models_dataset_biostudies_default(
     test_submission_biostudies_default_direct_files,
     ingestion_result_summary_biostudies_default,
 ):
     expected = mock_dataset.get_dataset_biostudies_default()
     created = default_dataset.get_dataset_overview(
         test_submission_biostudies_default_direct_files,
         mock_object_constants.study_uuid_biostudies_default,
         ingestion_result_summary_biostudies_default,
     )

     assert expected == created
181 changes: 181 additions & 0 deletions bia-ingest/test/test_order_of_processing_datasets.py
@@ -0,0 +1,181 @@
from bia_ingest.biostudies.process_submission_v4 import process_submission_v4
from bia_ingest.biostudies import api
from bia_shared_datamodels import bia_data_model

import pytest
from bia_test_data.mock_objects import (
    mock_dataset,
    mock_file_reference,
)

from bia_ingest.persistence_strategy import (
    persistence_strategy_factory,
    PersistenceStrategy,
)


def _modify_annotation_file_list(
    submission: api.Submission, file_list_path: str
) -> api.Submission:
    """Modify the path of the file list in an 'annotation' study component"""

    # Assumes there is only one annotation section and uses the first one found
    copy_of_submission = api.Submission.model_validate(submission.model_dump())
    annotation_section = next(
        s for s in copy_of_submission.section.subsections if s.type == "Annotations"
    )
    attribute = next(a for a in annotation_section.attributes if a.name == "File List")
    attribute.value = file_list_path
    return copy_of_submission


@pytest.fixture(scope="function")
def persister(test_submission, tmp_path) -> PersistenceStrategy:
    persister = persistence_strategy_factory(
        persistence_mode="disk",
        output_dir_base=str(tmp_path),
        accession_id=test_submission.accno,
    )
    return persister


@pytest.fixture
def submission_where_same_file_list_is_used_in_annotation_and_a_study_component(
    test_submission,
) -> api.Submission:
    return _modify_annotation_file_list(
        test_submission, "file_list_study_component_1.json"
    )


@pytest.fixture
def submission_where_same_file_is_referred_to_in_annotation_fl_and_a_study_component_fl(
    test_submission,
) -> api.Submission:
    # file_list_annotations_and_images has one image from study component 1 and one from study component 2
    return _modify_annotation_file_list(
        test_submission, "file_list_annotations_and_images.json"
    )


@pytest.fixture
def only_study_component_file_references(
    test_submission,
    mock_request_get,
) -> dict:
    datasets = mock_dataset.get_dataset()

    ds_study_component_1 = datasets[0]
    ds_study_component_2 = datasets[1]

    file_references_sc_1 = mock_file_reference.get_file_reference(
        {
            ds_study_component_1.uuid: "biad_v4/file_list_study_component_1.json",
        }
    )
    file_references_sc_2 = mock_file_reference.get_file_reference(
        {
            ds_study_component_2.uuid: "biad_v4/file_list_study_component_2.json",
        },
    )

    file_references = {f.uuid: f for f in file_references_sc_1}
    file_references.update({f.uuid: f for f in file_references_sc_2})

    return file_references


@pytest.fixture
def expected_file_references_where_same_file_is_referred_to_in_annotation_fl_and_a_study_component_fl(
    test_submission,
    mock_request_get,
    only_study_component_file_references,
) -> dict:
    datasets = mock_dataset.get_dataset()

    ds_annotation = datasets[2]

    file_references_annotation = mock_file_reference.get_file_reference(
        {
            ds_annotation.uuid: "biad_v4/file_list_annotations_1.json",
        },
    )

    file_references = only_study_component_file_references.copy()
    file_references.update({f.uuid: f for f in file_references_annotation})

    return file_references


def test_order_of_processing_datasets_where_same_file_list_is_used_in_annotation_and_a_study_component(
    submission_where_same_file_list_is_used_in_annotation_and_a_study_component,
    only_study_component_file_references,
    ingestion_result_summary,
    persister,
    tmp_path,
):
    """
    Test that when an annotation study component and a standard study component
    have the same file list, resulting file references point to the dataset for
    the standard study component.
    """

    submission = (
        submission_where_same_file_list_is_used_in_annotation_and_a_study_component
    )
    expected_file_references = only_study_component_file_references
    process_submission_v4(
        submission=submission,
        result_summary=ingestion_result_summary,
        process_files=True,
        persister=persister,
    )

    # Check expected number of file references written
    file_reference_base_path = tmp_path / "file_reference" / submission.accno
    created_file_references = [
        bia_data_model.FileReference.model_validate_json(f.read_text())
        for f in file_reference_base_path.glob("*.json")
    ]

    # Check the file references belong to the expected datasets
    assert len(created_file_references) == len(expected_file_references)
    for created_file_reference in created_file_references:
        expected_file_reference = expected_file_references[created_file_reference.uuid]
        assert expected_file_reference == created_file_reference


def test_order_of_processing_datasets_where_same_file_is_referred_to_in_annotation_fl_and_a_study_component_fl(
    submission_where_same_file_is_referred_to_in_annotation_fl_and_a_study_component_fl,
    expected_file_references_where_same_file_is_referred_to_in_annotation_fl_and_a_study_component_fl,
    ingestion_result_summary,
    persister,
    tmp_path,
):
    """
    Test that when the same file is in an annotation study component
    and a standard study component, the file reference created points
    to the dataset for the standard study component.
    """

    submission = submission_where_same_file_list_is_used_in_annotation_and_a_study_component if False else submission_where_same_file_is_referred_to_in_annotation_fl_and_a_study_component_fl
    expected_file_references = expected_file_references_where_same_file_is_referred_to_in_annotation_fl_and_a_study_component_fl
    process_submission_v4(
        submission=submission,
        result_summary=ingestion_result_summary,
        process_files=True,
        persister=persister,
    )

    # Check expected number of file references written
    file_reference_base_path = tmp_path / "file_reference" / submission.accno
    created_file_references = [
        bia_data_model.FileReference.model_validate_json(f.read_text())
        for f in file_reference_base_path.glob("*.json")
    ]

    # Check the file references belong to the expected datasets
    assert len(created_file_references) == len(expected_file_references)
    for created_file_reference in created_file_references:
        expected_file_reference = expected_file_references[created_file_reference.uuid]
        assert expected_file_reference == created_file_reference
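
A brief usage note: the two tests above rely on the test_submission, ingestion_result_summary and mock_request_get fixtures, which are presumably provided by the test suite's shared conftest. Assuming the project's test dependencies are installed and the commands are run from the repository root, the new module alone should run with: pytest bia-ingest/test/test_order_of_processing_datasets.py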