Skip to content

Commit

Permalink
Assign images with proposals (#302)
Browse files Browse the repository at this point in the history
* Add bia_curation to provenance list for attributes

* feat: Add propose-files command to CLI for proposing convertible file references

* feat: Add propose.py for image assignment functionality

* feat: Add assign-from-proposal command to process file references from proposal files

* feat: Return image UUID from assign and create default representation in assign_from_proposal

* Include use of propose in image assignment

* Alter propose function to allow multiple accession IDs

* docs: Add propose and process workflow to bia-assign-image README
  • Loading branch information
matthewh-ebi authored Feb 7, 2025
1 parent 3bdfe5a commit 8337402
Show file tree
Hide file tree
Showing 9 changed files with 517 additions and 9 deletions.
34 changes: 27 additions & 7 deletions bia-assign-image/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,40 @@ This sub-package assigns file reference(s) to BIA Image objects and creates imag
Install the project using poetry.

## Usage
This package has 2 cli applications:
This package has the following CLI commands:
* **propose-images**: generate proposals for convertible images from accessions
* **assign-from-proposal**: process a proposal file to create images and representations
* **assign**: used to assign file reference(s) to BIA Image objects
* **representations**: used to create image representation objects (without conversion of images) from BIA Image objects.
* **representations**: used to create image representation objects (without conversion of images) from BIA Image objects

The artefacts created are saved to the API by default. The current version of the cli allows saving
to disk using the option `--persistence-mode disk` on either command. However, this will be deprecated in
to disk using the option `--persistence-mode disk` on any command. However, this will be deprecated in
a future revision.
## Assigning file reference(s) to BIA Image objects (in other words, create a BIA Image object for those file reference(s))
To create a BIA Image of a set of file references run:
``` sh

## Proposing and Processing Images
The recommended workflow is to first generate proposals for which images to convert:

```sh
poetry run bia-assign-image propose-images S-BIAD1 proposals.txt --max-items 5
```

This will analyze the accession and suggest up to 5 file references to convert, writing them to proposals.txt.
You can specify multiple accession IDs. Proposals are appended to an existing proposal file by default (`--append`); pass `--no-append` to overwrite the file instead.

Then process the proposals to create images and representations:

```sh
poetry run bia-assign-image assign-from-proposal proposals.txt
```

This will create BIA Image objects and default representations for each proposed file reference.
## Manual Assignment
To directly create a BIA Image from file references without using proposals, run:
```sh
poetry run bia-assign-image assign <STUDY ACCESSION ID> <LIST OF FILE REFERENCE UUIDS>
```
E.g. Assuming the study S-BIAD1285 has been ingested:
```
```sh
poetry run bia-assign-image assign S-BIAD1285 b768fb72-7ea2-4b80-b54d-bdf5ca280bfd
```

Expand Down
54 changes: 53 additions & 1 deletion bia-assign-image/bia_assign_image/cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List, Any
from pathlib import Path
from typing import Annotated
import typer
from bia_shared_datamodels import bia_data_model, uuid_creation, semantic_models
Expand All @@ -11,6 +12,7 @@
image,
specimen,
creation_process,
propose,
)
from bia_assign_image.image_representation import get_image_representation
from bia_assign_image.config import settings, api_client
Expand Down Expand Up @@ -62,7 +64,7 @@ def assign(
PersistenceMode, typer.Option(case_sensitive=False)
] = PersistenceMode.api,
dryrun: Annotated[bool, typer.Option()] = False,
) -> None:
) -> str:
persister = persistence_strategy_factory(
persistence_mode,
output_dir_base=settings.bia_data_dir,
Expand Down Expand Up @@ -173,6 +175,7 @@ def assign(
logger.info(
f"Generated bia_data_model.Image object {bia_image.uuid} and persisted to {persistence_mode}"
)
return str(bia_image.uuid)


@representations_app.command(help="Create specified representations")
Expand Down Expand Up @@ -239,6 +242,55 @@ def create(
logger.warning(message)


@app.command(help="Assign images from a proposal file")
def assign_from_proposal(
    proposal_path: Annotated[Path, typer.Argument(help="Path to the proposal file")],
    persistence_mode: Annotated[
        PersistenceMode, typer.Option(case_sensitive=False)
    ] = PersistenceMode.api,
    dryrun: Annotated[bool, typer.Option()] = False,
) -> None:
    """Assign every file reference listed in a proposal file to an image.

    Each proposal row is passed to ``assign`` as a single file reference.
    Unless ``dryrun`` is set, ``create`` is then called for the image UUID
    that ``assign`` returned.

    Args:
        proposal_path: Path to the proposal file.
        persistence_mode: Forwarded unchanged to ``assign`` and ``create``.
        dryrun: Forwarded to ``assign``; when true ``create`` is not called.
    """
    for proposal in propose.read_proposals(proposal_path):
        accession_id = proposal["accession_id"]

        image_uuid = assign(
            accession_id=accession_id,
            file_reference_uuids=[proposal["uuid"]],
            persistence_mode=persistence_mode,
            dryrun=dryrun,
        )

        if dryrun:
            continue

        # Create default representation
        create(
            accession_id=accession_id,
            image_uuid_list=[image_uuid],
            persistence_mode=persistence_mode,
        )


@app.command(help="Propose file references to convert for an accession")
def propose_images(
    accession_ids: Annotated[
        List[str], typer.Argument(help="Accession IDs to process")
    ],
    output_path: Annotated[Path, typer.Argument(help="Path to write the proposals")],
    max_items: Annotated[int, typer.Option(help="Maximum number of items to propose")] = 5,
    append: Annotated[bool, typer.Option(help="Append to existing file instead of overwriting")] = True,
) -> None:
    """Propose file references to convert for the given accession IDs.

    Proposals for every accession ID are written to the same output file.

    Args:
        accession_ids: Accession IDs to process, in order.
        output_path: File the proposals are written to.
        max_items: Maximum number of proposals per accession ID.
        append: When false, existing content of ``output_path`` is replaced.
    """
    for position, accession_id in enumerate(accession_ids):
        # Only the first accession may truncate the file. Later ones must
        # append, otherwise each would overwrite the proposals written for
        # the accessions before it.
        count = propose.write_convertible_file_references_for_accession_id(
            accession_id,
            output_path,
            max_items=max_items,
            append=append or position > 0,
        )
        logger.info(f"Wrote {count} proposals for {accession_id} to {output_path}")


@app.callback()
def main() -> None:
    # Deliberate no-op callback. A comment is used rather than a docstring
    # because Typer shows a callback's docstring as the application help text.
    return None
Expand Down
2 changes: 1 addition & 1 deletion bia-assign-image/bia_assign_image/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class Settings(BaseSettings):
env_file=f"{Path(__file__).parent.parent / '.env'}",
env_file_encoding="utf-8",
case_sensitive=False,
# extra="forbid",
extra="allow",
)

bia_data_dir: str = Field(default_output_base)
Expand Down
184 changes: 184 additions & 0 deletions bia-assign-image/bia_assign_image/propose.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""Functions to allow proposing images to convert
Propose file references to convert by sorting based on size,
partitioning into n groups and randomly selecting one
file reference from each group
"""

import math
import random
from typing import List, Dict
from pathlib import Path
from bia_assign_image.config import api_client
from bia_assign_image.utils import in_bioformats_single_file_formats_list


def select_indicies(n_indicies: int, n_to_select: int = 5) -> list[int]:
    """Select up to ``n_to_select`` indices from ``range(n_indicies)``.

    The index range is split into ``n_to_select`` contiguous chunks whose
    sizes differ by at most one (the leading chunks absorb the remainder)
    and one index is drawn at random from each chunk.

    Args:
        n_indicies: Length of the sequence being indexed.
        n_to_select: Maximum number of indices to return.

    Returns:
        The selected indices in ascending order. Every index is returned
        when ``n_indicies <= n_to_select``. An empty list is returned when
        ``n_to_select`` is not positive.
    """
    # Nothing can be selected; this also avoids dividing by zero below.
    if n_to_select <= 0:
        return []

    if n_indicies <= n_to_select:
        return list(range(n_indicies))

    # Fixed seed to allow reproducibility on repeated runs.
    # Note: Only applies to selections after 23/12/2024
    # A private generator produces the same sequence as seeding the module
    # level generator, without resetting random state for other callers.
    rng = random.Random(42)

    min_per_chunk, remainder = divmod(n_indicies, n_to_select)
    selected_indicies = []
    start = 0
    for i in range(n_to_select):
        # The first `remainder` chunks are one element longer.
        n_per_chunk = min_per_chunk + (1 if i < remainder else 0)
        stop = start + n_per_chunk - 1
        selected_indicies.append(rng.randint(start, stop))
        start = stop + 1
    return selected_indicies


def count_lines(file_path):
    """Return the number of lines in the text file at ``file_path``."""
    total = 0
    with open(file_path, "r") as handle:
        for _ in handle:
            total += 1
    return total


def read_specific_line(file_path, line_number):
    """Return the line at zero-based ``line_number`` of ``file_path``.

    The line is returned exactly as read, including any trailing newline.
    ``None`` is returned when the file has no line at that position.
    """
    with open(file_path, "r") as handle:
        position = 0
        for text in handle:
            if position == line_number:
                return text
            position += 1
    # Reached the end of the file without meeting the requested line.
    return None


def sizeof_fmt(num, suffix="B"):
    """Format ``num`` with binary (1024-based) unit prefixes.

    The value is divided by 1024 until its magnitude drops below 1024 and
    is then rendered with one decimal place, the matching prefix and
    ``suffix``. Values too large for ``Zi`` are reported in ``Yi``.
    """
    prefixes = ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi")
    value = num
    for prefix in prefixes:
        if abs(value) < 1024.0:
            return f"{value:3.1f}{prefix}{suffix}"
        value = value / 1024.0
    return f"{value:.1f}Yi{suffix}"


def get_convertible_file_references(accession_id: str) -> List[Dict]:
    """Return details of the convertible file references of a study.

    The study is looked up by accession ID and the file references of all
    its datasets are collected. Only file references whose path is accepted
    by ``in_bioformats_single_file_formats_list`` are kept.

    Args:
        accession_id: Accession ID of the study to inspect.

    Returns:
        One dict per kept file reference, ordered by size and then name,
        both descending. An empty list when no study is found.
    """

    # ToDo: Page through the API results until all data has been returned,
    # rather than relying on a single very large page.
    PAGE_SIZE_DEFAULT = 10000000

    study = api_client.search_study_by_accession(accession_id)
    if not study:
        return []

    datasets = api_client.get_dataset_linking_study(
        study.uuid, page_size=PAGE_SIZE_DEFAULT
    )

    all_file_references = []
    for dataset in datasets:
        linked = api_client.get_file_reference_linking_dataset(
            dataset.uuid, PAGE_SIZE_DEFAULT
        )
        all_file_references.extend(linked)

    candidates = []
    for file_reference in all_file_references:
        if not in_bioformats_single_file_formats_list(file_reference.file_path):
            continue
        candidates.append(
            {
                "accession_id": accession_id,
                "study_uuid": study.uuid,
                "name": file_reference.file_path,
                "uuid": file_reference.uuid,
                "size_in_bytes": file_reference.size_in_bytes,
                "size_human_readable": sizeof_fmt(file_reference.size_in_bytes),
            }
        )

    # Largest first; ties are broken by name, also descending.
    candidates.sort(
        key=lambda entry: (entry["size_in_bytes"], entry["name"]),
        reverse=True,
    )
    return candidates


def write_convertible_file_references_for_accession_id(
    accession_id: str,
    output_path: Path,
    max_items: int = 5,
    append: bool = True,
) -> int:
    """Write details of file references proposed for conversion to file.

    The convertible file references of ``accession_id`` are fetched, up to
    ``max_items`` of them are chosen with ``select_indicies``, and one
    tab-separated row per chosen file reference is written to
    ``output_path``. A header row is written first when the output stream
    starts at offset zero.

    Args:
        accession_id: Accession ID of the study to propose images for.
        output_path: File the proposals are written to.
        max_items: Maximum number of proposals to write.
        append: Open the file in append mode when true, otherwise overwrite.

    Returns:
        The number of proposals written.
    """

    convertible_file_references = get_convertible_file_references(accession_id)
    indicies_to_select = select_indicies(
        len(convertible_file_references), max_items
    )

    # (column title in the file, key in the file reference dict)
    columns = [
        ("accession_id", "accession_id"),
        ("study_uuid", "study_uuid"),
        ("name", "name"),
        ("file_reference_uuid", "uuid"),
        ("size_in_bytes", "size_in_bytes"),
        ("size_human_readable", "size_human_readable"),
    ]
    header = "\t".join(title for title, _ in columns)
    lines = [
        "\t".join(f"{convertible_file_references[i][key]}" for _, key in columns)
        for i in indicies_to_select
    ]

    open_text_mode = "a" if append else "w"
    with output_path.open(open_text_mode) as fid:
        # If we are at start of file write header.
        if fid.tell() == 0:
            fid.write(header + "\n")
        # Skip the body when nothing was selected so that no stray blank
        # line is written. The trailing newline makes the next append start
        # on a new line.
        if lines:
            fid.write("\n".join(lines) + "\n")

    return len(indicies_to_select)


def read_proposals(proposal_path: Path) -> List[Dict]:
    """Read proposals from a tab-separated file.

    The first line is treated as a header and skipped. Blank lines are
    ignored. Every other line must hold six tab-separated fields:
    accession ID, study UUID, name, file reference UUID, size in bytes and
    human readable size.

    Args:
        proposal_path: Path to the proposal file.

    Returns:
        A list of dicts with the keys ``accession_id``, ``study_uuid``,
        ``name``, ``uuid``, ``size_in_bytes`` (int) and
        ``size_human_readable``. Empty when the file has no data rows.

    Raises:
        ValueError: If a row does not have six fields or its size is not
            an integer.
    """
    proposals = []
    with proposal_path.open("r") as f:
        # Skip header. The default stops an empty file from raising
        # StopIteration.
        if next(f, None) is None:
            return proposals

        # Data rows start on line 2 of the file.
        for line_number, line in enumerate(f, start=2):
            stripped = line.strip()
            if not stripped:
                continue

            fields = stripped.split("\t")
            if len(fields) != 6:
                raise ValueError(
                    f"{proposal_path}:{line_number}: expected 6 tab-separated "
                    f"fields, found {len(fields)}"
                )

            accession_id, study_uuid, name, file_ref_uuid, size, human_size = fields
            proposals.append(
                {
                    "accession_id": accession_id,
                    "study_uuid": study_uuid,
                    "name": name,
                    "uuid": file_ref_uuid,
                    "size_in_bytes": int(size),
                    "size_human_readable": human_size,
                }
            )
    return proposals
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Curated files last updated 21/05/2024

Notes on creating the curated files:

1. Copy supported formats from https://bio-formats.readthedocs.io/en/stable/supported-formats.html
2. Paste into spreadsheet or text editor and get all extensions
3. Ensure extensions are unique and sorted
4. Manually curate into 'bioformats_curated_single_file_formats.txt', which lists formats that have a one-to-one conversion with bioformats2raw, and 'bioformats_curated_other_file_formats.txt', which lists formats that require more input for conversion (e.g. pattern files)

The above steps can be accomplished in a browser developer console using the following js snippet (thanks to LA):

[...new Set(Array.from(document.getElementsByTagName("tbody")[0].querySelectorAll("td:nth-child(2)")).map(el => el.innerText.split(",")).flat().filter(extension => extension.length))].sort()

TODO: write python version of js snippet e.g. using selenium
Loading

0 comments on commit 8337402

Please sign in to comment.