Assign images with proposals (#302)

* Add bia_curation to provenance list for attributes * feat: Add propose-files command to CLI for proposing convertible file references * feat: Add propose.py for image assignment functionality * feat: Add assign-from-proposal command to process file references from proposal files * feat: Return image UUID from assign and create default representation in assign_from_proposal * Include use of propose in image assignment * Alter propose function to allow multiple accession IDs * docs: Add propose and process workflow to bia-assign-image README
BioImage-Archive · Feb 7, 2025 · 8337402 · 8337402
1 parent 3bdfe5a
commit 8337402
Show file tree

Hide file tree

Showing 9 changed files with 517 additions and 9 deletions.
diff --git a/bia-assign-image/README.md b/bia-assign-image/README.md
@@ -6,20 +6,40 @@ This sub-package assigns file reference(s) to BIA Image objects and creates imag
 Install the project using poetry.
 
 ## Usage
-This package has 2 cli applications:
+This package has the following CLI commands:
+ * **propose-images**: generate proposals for convertible images from accessions
+ * **assign-from-proposal**: process a proposal file to create images and representations
  * **assign**: used to assign file reference(s) to BIA Image objects
- * **representations**: used to create image representation objects (without conversion of images) from BIA Image objects.
+ * **representations**: used to create image representation objects (without conversion of images) from BIA Image objects
 
 The artefacts created are saved to the API by default. The current version of the cli allows saving
-to disk using the option `--persistence-mode disk` on either command. However, this will be deprecated in
+to disk using the option `--persistence-mode disk` on any command. However, this will be deprecated in
 a future revision.
-## Assigning file reference(s) to BIA Image objects (in other words, create a BIA Image object for those file reference(s))
-To create a BIA Image of a set of file references run:
-``` sh
+
+## Proposing and Processing Images
+The recommended workflow is to first generate proposals for which images to convert:
+
+```sh
+poetry run bia-assign-image propose-images S-BIAD1 proposals.txt --max-items 5
+```
+
+This will analyze the accession and suggest up to 5 file references to convert, writing them to proposals.txt.
+You can specify multiple accession IDs. Proposals are appended to an existing proposal file by default (`--append`); pass `--no-append` to overwrite the file instead.
+
+Then process the proposals to create images and representations:
+
+```sh
+poetry run bia-assign-image assign-from-proposal proposals.txt
+```
+
+This will create BIA Image objects and default representations for each proposed file reference.
+## Manual Assignment
+To directly create a BIA Image from file references without using proposals, run:
+```sh
 poetry run bia-assign-image assign <STUDY ACCESSION ID> <LIST OF FILE REFERENCE UUIDS>
 ```
 E.g. Assuming the study S-BIAD1285 has been ingested:
-```
+```sh
 poetry run bia-assign-image assign S-BIAD1285 b768fb72-7ea2-4b80-b54d-bdf5ca280bfd
 ```
 

diff --git a/bia-assign-image/bia_assign_image/cli.py b/bia-assign-image/bia_assign_image/cli.py
@@ -1,4 +1,5 @@
 from typing import List, Any
+from pathlib import Path
 from typing import Annotated
 import typer
 from bia_shared_datamodels import bia_data_model, uuid_creation, semantic_models
@@ -11,6 +12,7 @@
     image,
     specimen,
     creation_process,
+    propose,
 )
 from bia_assign_image.image_representation import get_image_representation
 from bia_assign_image.config import settings, api_client
@@ -62,7 +64,7 @@ def assign(
         PersistenceMode, typer.Option(case_sensitive=False)
     ] = PersistenceMode.api,
     dryrun: Annotated[bool, typer.Option()] = False,
-) -> None:
+) -> str:
     persister = persistence_strategy_factory(
         persistence_mode,
         output_dir_base=settings.bia_data_dir,
@@ -173,6 +175,7 @@ def assign(
         logger.info(
             f"Generated bia_data_model.Image object {bia_image.uuid} and persisted to {persistence_mode}"
         )
+    return str(bia_image.uuid)
 
 
 @representations_app.command(help="Create specified representations")
@@ -239,6 +242,55 @@ def create(
                 logger.warning(message)
 
 
+@app.command(help="Assign images from a proposal file")
+def assign_from_proposal(
+    proposal_path: Annotated[Path, typer.Argument(help="Path to the proposal file")],
+    persistence_mode: Annotated[
+        PersistenceMode, typer.Option(case_sensitive=False)
+    ] = PersistenceMode.api,
+    dryrun: Annotated[bool, typer.Option()] = False,
+) -> None:
+    """Process a proposal file and assign the file references to images"""
+    proposals = propose.read_proposals(proposal_path)
+
+
+    for p in proposals:
+        image_uuid = assign(
+            accession_id=p['accession_id'],
+            file_reference_uuids=[p['uuid']],
+            persistence_mode=persistence_mode,
+            dryrun=dryrun
+        )
+
+        if not dryrun:
+            # Create default representation
+            create(
+                accession_id=p['accession_id'],
+                image_uuid_list=[image_uuid],
+                persistence_mode=persistence_mode
+            )
+
+
+@app.command(help="Propose file references to convert for an accession")
+def propose_images(
+    accession_ids: Annotated[
+        List[str], typer.Argument(help="Accession IDs to process")
+    ],
+    output_path: Annotated[Path, typer.Argument(help="Path to write the proposals")],
+    max_items: Annotated[int, typer.Option(help="Maximum number of items to propose")] = 5,
+    append: Annotated[bool, typer.Option(help="Append to existing file instead of overwriting")] = True,
+) -> None:
+    """Propose file references to convert for the given accession IDs"""
+    for accession_id in accession_ids:
+        count = propose.write_convertible_file_references_for_accession_id(
+            accession_id,
+            output_path,
+            max_items=max_items,
+            append=append
+        )
+        logger.info(f"Wrote {count} proposals for {accession_id} to {output_path}")
+
+
 @app.callback()
 def main() -> None:
     return

diff --git a/bia-assign-image/bia_assign_image/config.py b/bia-assign-image/bia_assign_image/config.py
@@ -19,7 +19,7 @@ class Settings(BaseSettings):
         env_file=f"{Path(__file__).parent.parent / '.env'}",
         env_file_encoding="utf-8",
         case_sensitive=False,
-        # extra="forbid",
+        extra="allow",
     )
 
     bia_data_dir: str = Field(default_output_base)

diff --git a/bia-assign-image/bia_assign_image/propose.py b/bia-assign-image/bia_assign_image/propose.py
@@ -0,0 +1,184 @@
+"""Functions to allow proposing images to convert
+
+Propose file references to convert by sorting based on size,
+partitioning into n groups and randomly selecting one
+file reference from each group
+"""
+
+import math
+import random
+from typing import List, Dict
+from pathlib import Path
+from bia_assign_image.config import api_client
+from bia_assign_image.utils import in_bioformats_single_file_formats_list
+
+
+def select_indicies(n_indicies: int, n_to_select: int = 5) -> list[int]:
+    """Select a number of indicies from input list
+
+    Select a number of indicies from input list. Split list into
+    n_to_select chunks and randomly select an index from each chunk
+    """
+
+    # Seed to allow reproducibility on repeated runs.
+    # Note: Only applies to selections after 23/12/2024
+    random.seed(42)
+
+    if n_indicies <= n_to_select:
+        return list(range(n_indicies))
+
+    min_per_chunk = math.floor(n_indicies / n_to_select)
+    remainder = n_indicies % n_to_select
+    selected_indicies = []
+    stop = -1
+    for i in range(n_to_select):
+        n_per_chunk = min_per_chunk
+        if remainder > 0 and i < remainder:
+            n_per_chunk += 1
+        start = stop + 1
+        stop = start + n_per_chunk - 1
+        selected_index = random.randint(start, stop)
+        selected_indicies.append(selected_index)
+    return selected_indicies
+
+
+def count_lines(file_path):
+    with open(file_path, "r") as file:
+        return sum(1 for _ in file)
+
+
+def read_specific_line(file_path, line_number):
+    with open(file_path, "r") as file:
+        for current_line_number, line in enumerate(file, start=0):
+            if current_line_number == line_number:
+                return line  # .strip()
+    return None  # If the line number is beyond the end of the file
+
+
+def sizeof_fmt(num, suffix="B"):
+    for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
+        if abs(num) < 1024.0:
+            return f"{num:3.1f}{unit}{suffix}"
+        num /= 1024.0
+    return f"{num:.1f}Yi{suffix}"
+
+
+def get_convertible_file_references(accession_id: str) -> List[Dict]:
+    """Get details of convertible images for given accession ID"""
+
+    # ToDo: Fix this to recursively call using until all data returned
+    PAGE_SIZE_DEFAULT = 10000000
+
+    study = api_client.search_study_by_accession(accession_id)
+    if not study:
+        return []
+    datasets = api_client.get_dataset_linking_study(
+        study.uuid, page_size=PAGE_SIZE_DEFAULT
+    )
+    file_references = []
+    for dataset in datasets:
+        file_references.extend(
+            api_client.get_file_reference_linking_dataset(
+                dataset.uuid, PAGE_SIZE_DEFAULT
+            )
+        )
+
+    convertible_file_references = [
+        {
+            "accession_id": accession_id,
+            "study_uuid": study.uuid,
+            "name": fr.file_path,
+            "uuid": fr.uuid,
+            "size_in_bytes": fr.size_in_bytes,
+            "size_human_readable": sizeof_fmt(fr.size_in_bytes),
+        }
+        for fr in file_references
+        if in_bioformats_single_file_formats_list(fr.file_path)
+    ]
+
+    convertible_file_references = sorted(
+        convertible_file_references,
+        key=lambda fr: (fr["size_in_bytes"], fr["name"]),
+        reverse=True,
+    )
+    return convertible_file_references
+
+
+def write_convertible_file_references_for_accession_id(
+    accession_id: str,
+    output_path: Path,
+    max_items: int = 5,
+    append: bool = True,
+) -> int:
+    """
+    Write details of file references proposed for conversion to file
+    """
+
+    convertible_file_references = get_convertible_file_references(accession_id)
+
+    n_proposal_candidates = len(convertible_file_references)
+    indicies_to_select = select_indicies(n_proposal_candidates, max_items)
+
+    if append:
+        open_text_mode = "a"
+    else:
+        open_text_mode = "w"
+
+    lines = [
+        "\t".join(
+            [
+                convertible_file_references[i]["accession_id"],
+                f"{convertible_file_references[i]['study_uuid']}",
+                convertible_file_references[i]["name"],
+                f"{convertible_file_references[i]['uuid']}",
+                f"{convertible_file_references[i]['size_in_bytes']}",
+                convertible_file_references[i]["size_human_readable"],
+            ]
+        )
+        for i in indicies_to_select
+    ]
+    with output_path.open(open_text_mode) as fid:
+        # If we are at start of file write header.
+        if fid.tell() == 0:
+            fid.writelines(
+                "\t".join(
+                    [
+                        "accession_id",
+                        "study_uuid",
+                        "name",
+                        "file_reference_uuid",
+                        "size_in_bytes",
+                        "size_human_readable",
+                    ]
+                )
+            )
+            fid.writelines("\n")
+        fid.writelines("\n".join(lines))
+        # Write a new line so next append starts on next line
+        fid.writelines("\n")
+
+    return len(indicies_to_select)
+
+
+def read_proposals(proposal_path: Path) -> List[Dict]:
+    """Read proposals from a tab-separated file
+    
+    Returns a list of dicts containing file reference info
+    """
+    proposals = []
+    with proposal_path.open('r') as f:
+        # Skip header
+        next(f)
+        for line in f:
+            if not line.strip():
+                continue
+            accession_id, study_uuid, name, file_ref_uuid, size, human_size = line.strip().split('\t')
+            proposals.append({
+                'accession_id': accession_id,
+                'study_uuid': study_uuid,
+                'name': name, 
+                'uuid': file_ref_uuid,
+                'size_in_bytes': int(size),
+                'size_human_readable': human_size
+            })
+    return proposals
diff --git a/bia-assign-image/bia_assign_image/resources/bioformats_curated_file_formats_readme.txt b/bia-assign-image/bia_assign_image/resources/bioformats_curated_file_formats_readme.txt
@@ -0,0 +1,14 @@
+Curated files last updated 21/05/2024
+
+Notes on creating the curated files:
+
+1. Copy supported formats from https://bio-formats.readthedocs.io/en/stable/supported-formats.html
+2. Paste into spreadsheet or text editor and get all extensions
+3. Ensure extensions are unique and sorted
+4. Manually curate into 'bioformats_curated_single_file_formats.txt', which lists formats that have a one-to-one conversion with bioformats2raw, and 'bioformats_curated_other_file_formats.txt', which lists formats that require more input for conversion (e.g. pattern files)
+
+The above steps can be accomplished in a browser developer console using the following js snippet (thanks to LA):
+
+[...new Set(Array.from(document.getElementsByTagName("tbody")[0].querySelectorAll("td:nth-child(2)")).map(el => el.innerText.split(",")).flat().filter(extension => extension.length))].sort()
+
+TODO: write python version of js snippet e.g. using selenium