* Add bia_curation to provenance list for attributes
* feat: Add propose-files command to CLI for proposing convertible file references
* feat: Add propose.py for image assignment functionality
* feat: Add assign-from-proposal command to process file references from proposal files
* feat: Return image UUID from assign and create default representation in assign_from_proposal
* Include use of propose in image assignment
* Alter propose function to allow multiple accession IDs
* docs: Add propose and process workflow to bia-assign-image README
1 parent 3bdfe5a, commit 8337402
Showing 9 changed files with 517 additions and 9 deletions.
@@ -0,0 +1,184 @@
"""Functions to allow proposing images to convert | ||
Propose file references to convert by sorting based on size, | ||
partitioning into n groups and randomly selecting one | ||
file reference from each group | ||
""" | ||
|
||
import math | ||
import random | ||
from typing import List, Dict | ||
from pathlib import Path | ||
from bia_assign_image.config import api_client | ||
from bia_assign_image.utils import in_bioformats_single_file_formats_list | ||
|
||
|
||
def select_indicies(n_indicies: int, n_to_select: int = 5) -> list[int]:
    """Select a number of indicies from input list
    Select a number of indicies from input list. Split list into
    n_to_select chunks and randomly select an index from each chunk
    """

    # Seed to allow reproducibility on repeated runs.
    # Note: Only applies to selections after 23/12/2024
    random.seed(42)

    if n_indicies <= n_to_select:
        return list(range(n_indicies))

    min_per_chunk = math.floor(n_indicies / n_to_select)
    remainder = n_indicies % n_to_select
    selected_indicies = []
    stop = -1
    for i in range(n_to_select):
        n_per_chunk = min_per_chunk
        if remainder > 0 and i < remainder:
            n_per_chunk += 1
        start = stop + 1
        stop = start + n_per_chunk - 1
        selected_index = random.randint(start, stop)
        selected_indicies.append(selected_index)
    return selected_indicies
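# Worked example: for select_indicies(12, 5), min_per_chunk = floor(12 / 5) = 2
# and remainder = 12 % 5 = 2, so the first two chunks hold three indices and
# the remaining three hold two. The chunks cover indices 0-2, 3-5, 6-7, 8-9
# and 10-11, and one index is drawn (with the fixed seed) from each chunk,
# always yielding five indices spread across the size-sorted list.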


def count_lines(file_path):
    with open(file_path, "r") as file:
        return sum(1 for _ in file)


def read_specific_line(file_path, line_number):
    with open(file_path, "r") as file:
        for current_line_number, line in enumerate(file, start=0):
            if current_line_number == line_number:
                return line  # .strip()
    return None  # If the line number is beyond the end of the file


def sizeof_fmt(num, suffix="B"):
    for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
        if abs(num) < 1024.0:
            return f"{num:3.1f}{unit}{suffix}"
        num /= 1024.0
    return f"{num:.1f}Yi{suffix}"


def get_convertible_file_references(accession_id: str) -> List[Dict]:
    """Get details of convertible images for given accession ID"""

    # ToDo: Fix this to recursively call until all data returned
    PAGE_SIZE_DEFAULT = 10000000

    study = api_client.search_study_by_accession(accession_id)
    if not study:
        return []
    datasets = api_client.get_dataset_linking_study(
        study.uuid, page_size=PAGE_SIZE_DEFAULT
    )
    file_references = []
    for dataset in datasets:
        file_references.extend(
            api_client.get_file_reference_linking_dataset(
                dataset.uuid, PAGE_SIZE_DEFAULT
            )
        )

    convertible_file_references = [
        {
            "accession_id": accession_id,
            "study_uuid": study.uuid,
            "name": fr.file_path,
            "uuid": fr.uuid,
            "size_in_bytes": fr.size_in_bytes,
            "size_human_readable": sizeof_fmt(fr.size_in_bytes),
        }
        for fr in file_references
        if in_bioformats_single_file_formats_list(fr.file_path)
    ]

    convertible_file_references = sorted(
        convertible_file_references,
        key=lambda fr: (fr["size_in_bytes"], fr["name"]),
        reverse=True,
    )
    return convertible_file_references
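# Shape of each returned entry (values are hypothetical):
# {
#     "accession_id": "S-BIADxxx",
#     "study_uuid": "<study UUID>",
#     "name": "images/plate_01.czi",
#     "uuid": "<file reference UUID>",
#     "size_in_bytes": 1073741824,
#     "size_human_readable": "1.0GiB",
# }
# Entries are sorted largest first by size, with ties broken by file path.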


def write_convertible_file_references_for_accession_id(
    accession_id: str,
    output_path: Path,
    max_items: int = 5,
    append: bool = True,
) -> int:
    """
    Write details of file references proposed for conversion to file
    """

    convertible_file_references = get_convertible_file_references(accession_id)

    n_proposal_candidates = len(convertible_file_references)
    indicies_to_select = select_indicies(n_proposal_candidates, max_items)

    if append:
        open_text_mode = "a"
    else:
        open_text_mode = "w"

    lines = [
        "\t".join(
            [
                convertible_file_references[i]["accession_id"],
                f"{convertible_file_references[i]['study_uuid']}",
                convertible_file_references[i]["name"],
                f"{convertible_file_references[i]['uuid']}",
                f"{convertible_file_references[i]['size_in_bytes']}",
                convertible_file_references[i]["size_human_readable"],
            ]
        )
        for i in indicies_to_select
    ]
    with output_path.open(open_text_mode) as fid:
        # If we are at start of file write header.
        if fid.tell() == 0:
            fid.writelines(
                "\t".join(
                    [
                        "accession_id",
                        "study_uuid",
                        "name",
                        "file_reference_uuid",
                        "size_in_bytes",
                        "size_human_readable",
                    ]
                )
            )
            fid.writelines("\n")
        fid.writelines("\n".join(lines))
        # Write a new line so next append starts on next line
        fid.writelines("\n")

    return len(indicies_to_select)


def read_proposals(proposal_path: Path) -> List[Dict]:
    """Read proposals from a tab-separated file
    Returns a list of dicts containing file reference info
    """
    proposals = []
    with proposal_path.open('r') as f:
        # Skip header
        next(f)
        for line in f:
            if not line.strip():
                continue
            accession_id, study_uuid, name, file_ref_uuid, size, human_size = line.strip().split('\t')
            proposals.append({
                'accession_id': accession_id,
                'study_uuid': study_uuid,
                'name': name,
                'uuid': file_ref_uuid,
                'size_in_bytes': int(size),
                'size_human_readable': human_size
            })
    return proposals
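Taken together, these functions back the propose-files and assign-from-proposal CLI commands listed in the commit message. As a rough sketch of the underlying workflow (the module path bia_assign_image.propose and the accession ID below are assumptions, not taken from this diff):

from pathlib import Path

# Assumed import path for the module shown above.
from bia_assign_image.propose import (
    read_proposals,
    write_convertible_file_references_for_accession_id,
)

proposal_path = Path("proposals.tsv")

# Hypothetical accession ID; repeat this call per accession to build up one
# proposal file (append=True keeps earlier proposals and the header is only
# written when the file is empty).
n_written = write_convertible_file_references_for_accession_id(
    "S-BIAD000", proposal_path, max_items=5, append=True
)
print(f"Proposed {n_written} file references")

# Each proposal is a dict keyed by accession_id, study_uuid, name,
# uuid (the file reference) and the two size fields.
for proposal in read_proposals(proposal_path):
    print(proposal["accession_id"], proposal["uuid"], proposal["size_human_readable"])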
bia-assign-image/bia_assign_image/resources/bioformats_curated_file_formats_readme.txt (14 additions, 0 deletions)
@@ -0,0 +1,14 @@
Curated files last updated 21/05/2024

Notes on creating the curated files:

1. Copy supported formats from https://bio-formats.readthedocs.io/en/stable/supported-formats.html
2. Paste into spreadsheet or text editor and get all extensions
3. Ensure extensions are unique and sorted
4. Manually curate into 'bioformats_curated_single_file_formats.txt' (formats with a one-to-one conversion via bioformats2raw) and 'bioformats_curated_other_file_formats.txt' (formats that require more input for conversion, e.g. pattern files)

The above steps can be accomplished in a browser developer console using the following js snippet (thanks to LA):

[...new Set(Array.from(document.getElementsByTagName("tbody")[0].querySelectorAll("td:nth-child(2)")).map(el => el.innerText.split(",")).flat().filter(extension => extension.length))].sort()

TODO: write a Python version of the js snippet, e.g. using Selenium
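A possible Python equivalent of the js snippet (a sketch, not the TODO's Selenium route: it assumes the requests and beautifulsoup4 packages and that the supported-formats page keeps its current table layout):

import requests
from bs4 import BeautifulSoup

URL = "https://bio-formats.readthedocs.io/en/stable/supported-formats.html"

html = requests.get(URL, timeout=30).text
soup = BeautifulSoup(html, "html.parser")

# Second cell of each row in the first table body holds the comma-separated
# extensions, mirroring the td:nth-child(2) selector in the js snippet.
cells = soup.select("tbody")[0].select("td:nth-of-type(2)")
extensions = sorted(
    {
        ext.strip()
        for cell in cells
        for ext in cell.get_text().split(",")
        if ext.strip()
    }
)
print(extensions)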