From 8337402d847c79baf11471a54060d921c2a33a25 Mon Sep 17 00:00:00 2001
From: matthewh-ebi <92516353+matthewh-ebi@users.noreply.github.com>
Date: Fri, 7 Feb 2025 18:25:21 +0000
Subject: [PATCH] Assign images with proposals (#302)

* Add bia_curation to provenance list for attributes

* feat: Add propose-files command to CLI for proposing convertible file references

* feat: Add propose.py for image assignment functionality

* feat: Add assign-from-proposal command to process file references from proposal files

* feat: Return image UUID from assign and create default representation in assign_from_proposal

* Include use of propose in image assignment

* Alter propose function to allow multiple accession IDs

* docs: Add propose and process workflow to bia-assign-image README
---
 bia-assign-image/README.md                    |  34 +++-
 bia-assign-image/bia_assign_image/cli.py      |  54 ++++-
 bia-assign-image/bia_assign_image/config.py   |   2 +-
 bia-assign-image/bia_assign_image/propose.py  | 184 ++++++++++++++++++
 ...bioformats_curated_file_formats_readme.txt |  14 ++
 .../bioformats_curated_other_file_formats.txt |  66 +++++++
 ...bioformats_curated_single_file_formats.txt | 120 ++++++++++++
 bia-assign-image/bia_assign_image/utils.py    |  50 +++++
 .../bia_shared_datamodels/semantic_models.py  |   2 +
 9 files changed, 517 insertions(+), 9 deletions(-)
 create mode 100644 bia-assign-image/bia_assign_image/propose.py
 create mode 100644 bia-assign-image/bia_assign_image/resources/bioformats_curated_file_formats_readme.txt
 create mode 100644 bia-assign-image/bia_assign_image/resources/bioformats_curated_other_file_formats.txt
 create mode 100644 bia-assign-image/bia_assign_image/resources/bioformats_curated_single_file_formats.txt
 create mode 100644 bia-assign-image/bia_assign_image/utils.py
diff --git a/bia-assign-image/README.md b/bia-assign-image/README.md
index 580fa401..8f6c3e2e 100644
--- a/bia-assign-image/README.md
+++ b/bia-assign-image/README.md
@@ -6,20 +6,40 @@ This sub-package assigns file reference(s) to BIA Image objects and creates imag
 Install the project using poetry.
 
 ## Usage
-This package has 2 cli applications:
+This package has the following CLI commands:
+ * **propose-images**: generate proposals for convertible images from accessions
+ * **assign-from-proposal**: process a proposal file to create images and representations
  * **assign**: used to assign file reference(s) to BIA Image objects
- * **representations**: used to create image representation objects (without conversion of images) from BIA Image objects.
+ * **representations**: used to create image representation objects (without conversion of images) from BIA Image objects
 
 The artefacts created are saved to the API by default. The current version of the cli allows saving
-to disk using the option `--persistence-mode disk` on either command. However, this will be deprecated in
+to disk using the option `--persistence-mode disk` on any command. However, this will be deprecated in
 a future revision.
-## Assigning file reference(s) to BIA Image objects (in other words, create a BIA Image object for those file reference(s))
-To create a BIA Image of a set of file references run:
-``` sh
+
+## Proposing and Processing Images
+The recommended workflow is to first generate proposals for which images to convert:
+
+```sh
+poetry run bia-assign-image propose-images S-BIAD1 proposals.txt --max-items 5
+```
+
+This will analyze the accession and suggest up to 5 file references to convert, writing them to proposals.txt.
+You can specify multiple accession IDs. Proposals are appended to an existing proposal file by default; use --no-append to overwrite it.
+
+Then process the proposals to create images and representations:
+
+```sh
+poetry run bia-assign-image assign-from-proposal proposals.txt
+```
+
+This will create BIA Image objects and default representations for each proposed file reference.
+## Manual Assignment
+To directly create a BIA Image from file references without using proposals, run:
+```sh
 poetry run bia-assign-image assign <STUDY ACCESSION ID> <LIST OF FILE REFERENCE UUIDS>
 ```
 E.g. Assuming the study S-BIAD1285 has been ingested:
-```
+```sh
 poetry run bia-assign-image assign S-BIAD1285 b768fb72-7ea2-4b80-b54d-bdf5ca280bfd
 ```
 
diff --git a/bia-assign-image/bia_assign_image/cli.py b/bia-assign-image/bia_assign_image/cli.py
index 56e699a8..3568f443 100644
--- a/bia-assign-image/bia_assign_image/cli.py
+++ b/bia-assign-image/bia_assign_image/cli.py
@@ -1,4 +1,5 @@
 from typing import List, Any
+from pathlib import Path
 from typing import Annotated
 import typer
 from bia_shared_datamodels import bia_data_model, uuid_creation, semantic_models
@@ -11,6 +12,7 @@
     image,
     specimen,
     creation_process,
+    propose,
 )
 from bia_assign_image.image_representation import get_image_representation
 from bia_assign_image.config import settings, api_client
@@ -62,7 +64,7 @@ def assign(
         PersistenceMode, typer.Option(case_sensitive=False)
     ] = PersistenceMode.api,
     dryrun: Annotated[bool, typer.Option()] = False,
-) -> None:
+) -> str:
     persister = persistence_strategy_factory(
         persistence_mode,
         output_dir_base=settings.bia_data_dir,
@@ -173,6 +175,7 @@ def assign(
         logger.info(
             f"Generated bia_data_model.Image object {bia_image.uuid} and persisted to {persistence_mode}"
         )
+    return str(bia_image.uuid)
 
 
 @representations_app.command(help="Create specified representations")
@@ -239,6 +242,55 @@ def create(
                 logger.warning(message)
 
 
+@app.command(help="Assign images from a proposal file")
+def assign_from_proposal(
+    proposal_path: Annotated[Path, typer.Argument(help="Path to the proposal file")],
+    persistence_mode: Annotated[
+        PersistenceMode, typer.Option(case_sensitive=False)
+    ] = PersistenceMode.api,
+    dryrun: Annotated[bool, typer.Option()] = False,
+) -> None:
+    """Process a proposal file and assign the file references to images"""
+    proposals = propose.read_proposals(proposal_path)
+
+    # Each proposal row yields one image built from that single file reference.
+    for p in proposals:
+        image_uuid = assign(
+            accession_id=p['accession_id'],
+            file_reference_uuids=[p['uuid']],
+            persistence_mode=persistence_mode,
+            dryrun=dryrun
+        )
+        # `assign` returns the UUID of the image as a string.
+        if not dryrun:
+            # Create default representation (skipped entirely on a dry run)
+            create(
+                accession_id=p['accession_id'],
+                image_uuid_list=[image_uuid],
+                persistence_mode=persistence_mode
+            )
+
+
+@app.command(help="Propose file references to convert for an accession")
+def propose_images(
+    accession_ids: Annotated[
+        List[str], typer.Argument(help="Accession IDs to process")
+    ],
+    output_path: Annotated[Path, typer.Argument(help="Path to write the proposals")],
+    max_items: Annotated[int, typer.Option(help="Maximum number of items to propose")] = 5,
+    append: Annotated[bool, typer.Option(help="Append to existing file instead of overwriting")] = True,
+) -> None:
+    """Propose up to max_items file references per accession ID and write them all to output_path"""
+    for n_written, accession_id in enumerate(accession_ids):
+        count = propose.write_convertible_file_references_for_accession_id(
+            accession_id,
+            output_path,
+            max_items=max_items,
+            append=append or n_written > 0,  # only the first write may overwrite the file
+        )
+        logger.info(f"Wrote {count} proposals for {accession_id} to {output_path}")
+
+
 @app.callback()
 def main() -> None:
     return
diff --git a/bia-assign-image/bia_assign_image/config.py b/bia-assign-image/bia_assign_image/config.py
index 8faebd52..566e40d2 100644
--- a/bia-assign-image/bia_assign_image/config.py
+++ b/bia-assign-image/bia_assign_image/config.py
@@ -19,7 +19,7 @@ class Settings(BaseSettings):
         env_file=f"{Path(__file__).parent.parent / '.env'}",
         env_file_encoding="utf-8",
         case_sensitive=False,
-        # extra="forbid",
+        extra="allow",
     )
 
     bia_data_dir: str = Field(default_output_base)
diff --git a/bia-assign-image/bia_assign_image/propose.py b/bia-assign-image/bia_assign_image/propose.py
new file mode 100644
index 00000000..6e66c76f
--- /dev/null
+++ b/bia-assign-image/bia_assign_image/propose.py
@@ -0,0 +1,184 @@
+"""Functions to allow proposing images to convert
+
+Propose file references to convert by sorting based on size,
+partitioning into n groups and randomly selecting one
+file reference from each group
+"""
+
+import math
+import random
+from typing import List, Dict
+from pathlib import Path
+from bia_assign_image.config import api_client
+from bia_assign_image.utils import in_bioformats_single_file_formats_list
+
+
+def select_indicies(n_indicies: int, n_to_select: int = 5) -> list[int]:
+    """Pick up to n_to_select indicies spread evenly over range(n_indicies)
+
+    The range is split into n_to_select contiguous chunks (sizes differ by
+    at most one) and one index is drawn at random from each chunk. All
+    indicies are returned when there are no more than n_to_select of them,
+    and an empty list is returned when n_to_select is not positive.
+    """
+    # Private generator with a fixed seed: selections are reproducible on
+    # repeated runs (only for selections made after 23/12/2024) without
+    # reseeding the process-wide `random` state as a side effect.
+    rng = random.Random(42)
+
+    if n_to_select <= 0:
+        return []
+    if n_indicies <= n_to_select:
+        return list(range(n_indicies))
+
+    min_per_chunk, remainder = divmod(n_indicies, n_to_select)
+    selected_indicies = []
+    start = 0
+    for i in range(n_to_select):
+        # The first `remainder` chunks absorb one extra index each.
+        n_per_chunk = min_per_chunk + (1 if i < remainder else 0)
+        selected_indicies.append(rng.randint(start, start + n_per_chunk - 1))
+        start += n_per_chunk
+    return selected_indicies
+
+
+def count_lines(file_path):  # returns how many lines the text file has
+    with open(file_path, "r") as file:
+        return sum(1 for _ in file)  # iterates line by line, adding 1 per line
+
+
+def read_specific_line(file_path, line_number):  # line_number is 0-based
+    with open(file_path, "r") as file:
+        for current_line_number, line in enumerate(file, start=0):
+            if current_line_number == line_number:
+                return line  # returned unstripped, so any trailing newline is kept
+    return None  # If the line number is beyond the end of the file
+
+
+def sizeof_fmt(num, suffix="B"):  # human-readable size, e.g. 1536 -> "1.5KiB"
+    for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
+        if abs(num) < 1024.0:  # first binary unit that keeps the value below 1024
+            return f"{num:3.1f}{unit}{suffix}"
+        num /= 1024.0
+    return f"{num:.1f}Yi{suffix}"  # still >= 1024 after Zi, so report in Yi
+
+
+def get_convertible_file_references(accession_id: str) -> List[Dict]:
+    """Get details of convertible images for given accession ID"""
+
+    # ToDo: Fix this to call repeatedly (paging) until all data is returned
+    PAGE_SIZE_DEFAULT = 10000000
+
+    study = api_client.search_study_by_accession(accession_id)
+    if not study:
+        return []  # no study found for this accession ID, so nothing to propose
+    datasets = api_client.get_dataset_linking_study(
+        study.uuid, page_size=PAGE_SIZE_DEFAULT
+    )
+    file_references = []
+    for dataset in datasets:
+        file_references.extend(
+            api_client.get_file_reference_linking_dataset(
+                dataset.uuid, PAGE_SIZE_DEFAULT
+            )
+        )
+
+    convertible_file_references = [
+        {
+            "accession_id": accession_id,
+            "study_uuid": study.uuid,
+            "name": fr.file_path,
+            "uuid": fr.uuid,
+            "size_in_bytes": fr.size_in_bytes,
+            "size_human_readable": sizeof_fmt(fr.size_in_bytes),
+        }
+        for fr in file_references
+        if in_bioformats_single_file_formats_list(fr.file_path)  # file extension check
+    ]
+
+    convertible_file_references = sorted(
+        convertible_file_references,
+        key=lambda fr: (fr["size_in_bytes"], fr["name"]),
+        reverse=True,  # largest first; equal sizes are ordered by name, also descending
+    )
+    return convertible_file_references
+
+
+def write_convertible_file_references_for_accession_id(
+    accession_id: str,
+    output_path: Path,
+    max_items: int = 5,
+    append: bool = True,
+) -> int:
+    """Write details of file references proposed for conversion to file
+
+    Up to max_items convertible file references of the accession are
+    chosen with select_indicies and written to output_path as
+    tab-separated rows. A header row is written first when writing
+    starts at the beginning of the file.
+
+    Args:
+        accession_id: Accession ID whose file references are proposed.
+        output_path: Proposal file to write.
+        max_items: Maximum number of file references to propose.
+        append: Add to an existing file when True, overwrite it otherwise.
+
+    Returns:
+        Number of proposals written.
+    """
+
+    convertible_file_references = get_convertible_file_references(accession_id)
+    indicies_to_select = select_indicies(len(convertible_file_references), max_items)
+
+    header = [
+        "accession_id",
+        "study_uuid",
+        "name",
+        "file_reference_uuid",
+        "size_in_bytes",
+        "size_human_readable",
+    ]
+    rows = [
+        [
+            fr["accession_id"],
+            f"{fr['study_uuid']}",
+            fr["name"],
+            f"{fr['uuid']}",
+            f"{fr['size_in_bytes']}",
+            fr["size_human_readable"],
+        ]
+        for fr in (convertible_file_references[i] for i in indicies_to_select)
+    ]
+
+    with output_path.open("a" if append else "w") as fid:
+        # If we are at start of file write header.
+        if fid.tell() == 0:
+            fid.write("\t".join(header) + "\n")
+        # Newline-terminate each row; with no rows nothing (no blank line) is written
+        fid.writelines("\t".join(row) + "\n" for row in rows)
+
+    return len(indicies_to_select)
+
+
+def read_proposals(proposal_path: Path) -> List[Dict]:
+    """Read proposals from a tab-separated file
+
+    The header line and blank lines are skipped. Returns a list of dicts
+    containing file reference info (empty if the file has no data rows).
+    """
+    proposals = []
+    with proposal_path.open('r') as f:
+        next(f, None)  # skip header; the None default tolerates an empty file
+        for line in f:
+            if not line.strip():
+                continue
+            accession_id, study_uuid, name, file_ref_uuid, size, human_size = line.strip().split('\t')
+            proposals.append({
+                'accession_id': accession_id,
+                'study_uuid': study_uuid,
+                'name': name,
+                'uuid': file_ref_uuid,
+                'size_in_bytes': int(size),
+                'size_human_readable': human_size
+            })
+    return proposals
diff --git a/bia-assign-image/bia_assign_image/resources/bioformats_curated_file_formats_readme.txt b/bia-assign-image/bia_assign_image/resources/bioformats_curated_file_formats_readme.txt
new file mode 100644
index 00000000..2e7dac28
--- /dev/null
+++ b/bia-assign-image/bia_assign_image/resources/bioformats_curated_file_formats_readme.txt
@@ -0,0 +1,14 @@
+Curated files last updated 21/05/2024
+
+Notes on creating the curated files:
+
+1. Copy supported formats from https://bio-formats.readthedocs.io/en/stable/supported-formats.html
+2. Paste into spreadsheet or text editor and get all extensions
+3. Ensure extensions are unique and sorted
+4. Manually curate into 'bioformats_curated_single_file_formats.txt' (formats which have a 1-to-1 conversion with bioformats2raw) and 'bioformats_curated_other_file_formats.txt' (formats which require more input for conversion, e.g. pattern files)
+
+The above steps can be accomplished in a browser developer console using the following js snippet (thanks to LA):
+
+[...new Set(Array.from(document.getElementsByTagName("tbody")[0].querySelectorAll("td:nth-child(2)")).map(el => el.innerText.split(",")).flat().filter(extension => extension.length))].sort()
+
+TODO: write python version of js snippet e.g. using selenium
diff --git a/bia-assign-image/bia_assign_image/resources/bioformats_curated_other_file_formats.txt b/bia-assign-image/bia_assign_image/resources/bioformats_curated_other_file_formats.txt
new file mode 100644
index 00000000..d311ed78
--- /dev/null
+++ b/bia-assign-image/bia_assign_image/resources/bioformats_curated_other_file_formats.txt
@@ -0,0 +1,66 @@
+.afi
+.apl
+.avi
+.c01
+.cfg
+.csv
+.dat
+.db
+.dcm
+.dib
+.dicom
+.dv
+.exp
+.flex
+.h5
+.hdr
+.hed
+.htd
+.html
+.ics
+.ids
+.l2d
+.labels
+.lei
+.mdb
+.mea
+.mov
+.mtb
+.mvd2
+.nd
+.ndpis
+.nhdr
+.nii.gz
+.nrrd
+.obf
+.obsep
+.oib
+.oif
+.ome
+.ome.btf
+.ome.tf2
+.ome.tf8
+.ome.tif
+.ome.tiff
+.ome.xml
+.omp2info
+.par
+.pcoraw
+.pds
+.pic
+.pnl
+.r3d
+.rcpnl
+.res
+.spc
+.stk
+.tnb
+.txt
+.vff
+.vms
+.vsi
+.vws
+.wpi
+.xdce
+.xml
+.xys
diff --git a/bia-assign-image/bia_assign_image/resources/bioformats_curated_single_file_formats.txt b/bia-assign-image/bia_assign_image/resources/bioformats_curated_single_file_formats.txt
new file mode 100644
index 00000000..4c5918a8
--- /dev/null
+++ b/bia-assign-image/bia_assign_image/resources/bioformats_curated_single_file_formats.txt
@@ -0,0 +1,120 @@
+.1sc
+.2fl
+.acff
+.afm
+.aim
+.al3d
+.ali
+.am
+.amiramesh
+.arf
+.bif
+.bin
+.bip
+.bmp
+.btf
+.ch5
+.cif
+.cr2
+.crw
+.cxd
+.czi
+.dm2
+.dm3
+.dm4
+.dti
+.eps
+.epsi
+.fdf
+.fff
+.ffr
+.fits
+.fli
+.frm
+.gel
+.gif
+.grey
+.hdf
+.his
+.hx
+.i2i
+.im3
+.img
+.ims
+.inr
+.ipl
+.ipm
+.ipw
+.j2k
+.jp2
+.jpeg
+.jpf
+.jpg
+.jpk
+.jpx
+.klb
+.lif
+.liff
+.lim
+.lms
+.lof
+.lsm
+.map
+.mnc
+.mng
+.mod
+.mrc
+.mrcs
+.mrw
+.msr
+.naf
+.nd2
+.ndpi
+.nef
+.nii
+.oir
+.pbm
+.pcx
+.pgm
+.pict
+.png
+.ppm
+.pr3
+.ps
+.psd
+.qptiff
+.raw
+.rec
+.scn
+.sdt
+.seq
+.sif
+.sld
+.sldy
+.sm2
+.sm3
+.spe
+.spi
+.st
+.stp
+.svs
+.sxm
+.tf2
+.tf8
+.tfr
+.tga
+.tif
+.tiff
+tiff
+.top
+.v
+.wat
+.wav
+.wlz
+.xlef
+.xqd
+.xqf
+.xv
+.zfp
+.zfr
+.zvi
diff --git a/bia-assign-image/bia_assign_image/utils.py b/bia-assign-image/bia_assign_image/utils.py
new file mode 100644
index 00000000..c2d97f4d
--- /dev/null
+++ b/bia-assign-image/bia_assign_image/utils.py
@@ -0,0 +1,50 @@
+from pathlib import Path
+
+single_file_formats_path = (  # extension list kept in the package's resources directory
+    Path(__file__).parent / "resources" / "bioformats_curated_single_file_formats.txt"
+)
+single_file_formats = [  # read once at import time; empty lines are dropped
+    s for s in single_file_formats_path.read_text().split("\n") if len(s) > 0
+]
+
+
+def extension_in_bioformats_single_file_formats_list(ext: str) -> bool:
+    """Return True if ext (leading dot optional) is in the single file formats list"""
+    ext = ext if not ext or ext.startswith(".") else "." + ext  # "" is left as is
+    return ext in single_file_formats
+
+
+def in_bioformats_single_file_formats_list(file_location: Path | str) -> bool:
+    """Check if ext of path/uri/name of file in bioformats single file formats list"""
+    ext = get_image_extension(f"{file_location}")
+    return extension_in_bioformats_single_file_formats_list(ext)
+
+
+def get_image_extension(file_path: str) -> str:
+    """Return the standardized, lower-case image extension of a file path."""
+
+    # Multi-suffix endings are matched against the whole path first, because
+    # Path.suffix only yields the last component; longer endings come first.
+    multi_suffix_ext = (
+        (".ome.zarr.zip", ".ome.zarr.zip"),
+        (".zarr.zip", ".zarr.zip"),
+        (".ome.zarr", ".ome.zarr"),
+        (".ome.tiff", ".ome.tiff"),
+        (".ome.tif", ".ome.tiff"),
+        (".tar.gz", ".tar.gz"),
+    )
+    lowered_path = file_path.lower()
+    standardized = next(
+        (out for end, out in multi_suffix_ext if lowered_path.endswith(end)),
+        None,
+    )
+    if standardized is not None:
+        return standardized
+
+    # Different spellings of the same single-suffix extension share one form
+    ext_map = {
+        ".jpeg": ".jpg",
+        ".tif": ".tiff",
+    }
+    suffix = Path(file_path).suffix.lower()
+    return ext_map.get(suffix, suffix)
\ No newline at end of file
diff --git a/bia-shared-datamodels/src/bia_shared_datamodels/semantic_models.py b/bia-shared-datamodels/src/bia_shared_datamodels/semantic_models.py
index 8e01f34c..8eb656bd 100644
--- a/bia-shared-datamodels/src/bia_shared_datamodels/semantic_models.py
+++ b/bia-shared-datamodels/src/bia_shared_datamodels/semantic_models.py
@@ -24,6 +24,8 @@ class AttributeProvenance(str, Enum):
 
     bia_conversion = "bia_conversion"
 
+    bia_curation = "bia_curation"
+
 
 class Attribute(ConfiguredBaseModel):
     provenance: AttributeProvenance = Field(