Skip to content

Commit

Permalink
Propose only from file refs of datasets with biosample, image acquisition and specimen prep protocols
Browse files Browse the repository at this point in the history
  • Loading branch information
kbab committed Feb 13, 2025
1 parent ecbb253 commit ed31c18
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 36 deletions.
14 changes: 12 additions & 2 deletions bia-assign-image/bia_assign_image/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ def assign_from_proposal(
for p in proposals:
accession_id = p["accession_id"]
file_reference_uuids = [
p["uuid"],
p["file_reference_uuid"],
]
try:
image_uuid = assign(
Expand Down Expand Up @@ -292,11 +292,21 @@ def propose_images(
append: Annotated[
bool, typer.Option(help="Append to existing file instead of overwriting")
] = True,
check_image_creation_prerequisites: Annotated[
bool,
typer.Option(
help="Check whether dataset linked to file reference contains requirements needed to create a bia_data_model Image object."
),
] = True,
) -> None:
"""Propose file references to convert for the given accession IDs"""
for accession_id in accession_ids:
count = propose.write_convertible_file_references_for_accession_id(
accession_id, output_path, max_items=max_items, append=append
accession_id,
output_path,
max_items=max_items,
check_image_creation_prerequisites=check_image_creation_prerequisites,
append=append,
)
logger.info(f"Wrote {count} proposals for {accession_id} to {output_path}")

Expand Down
114 changes: 83 additions & 31 deletions bia-assign-image/bia_assign_image/propose.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,53 @@

import math
import random
from typing import List, Dict
from typing import List, Dict, Any
from pathlib import Path
import csv
from bia_shared_datamodels import semantic_models, bia_data_model
from bia_assign_image.config import api_client
from bia_assign_image.utils import in_bioformats_single_file_formats_list


# Sentinel meaning "caller supplied no default". Using a sentinel instead of a
# literal [] in the signature avoids sharing one mutable list between calls,
# while still letting callers pass any explicit default (including None).
_NO_DEFAULT = object()


# TODO: This function was copied from cli.py - should it be in a common place? Do other subpackages of bia_integrator need it?
def get_value_from_attribute_list(
    attribute_list: List[semantic_models.Attribute],
    attribute_name: str,
    default: Any = _NO_DEFAULT,
) -> Any:
    """Get the value of a named attribute from a list of attributes.

    The first attribute whose ``name`` equals ``attribute_name`` is used and
    ``attribute.value[attribute_name]`` is returned.

    Args:
        attribute_list: Attributes to search, in order.
        attribute_name: Name to match against ``attribute.name``; also used
            as the key into the matching attribute's ``value``.
        default: Returned when no attribute matches. When omitted, a new
            empty list is returned on every call.

    Returns:
        The stored value for the first matching attribute, otherwise
        ``default`` (or a fresh ``[]`` if no default was given).

    Raises:
        KeyError: If a matching attribute's ``value`` has no
            ``attribute_name`` key.
    """

    # Assumes attribute.value is a Dict
    for attribute in attribute_list:
        if attribute.name == attribute_name:
            return attribute.value[attribute_name]

    # Hand out a fresh list so callers mutating the result cannot affect
    # later "not found" lookups.
    return [] if default is _NO_DEFAULT else default


def dataset_has_image_creation_prerequisites(dataset: bia_data_model.Dataset) -> bool:
    """Report whether a dataset carries the links assumed necessary for an Image.

    The assumption is that a biosample, an image acquisition protocol and a
    specimen (imaging) preparation protocol are all required. Each is looked
    up by name in ``dataset.attribute``; the dataset qualifies only when
    every looked-up value has a non-zero length.
    """

    required_attribute_names = (
        "image_acquisition_protocol_uuid",
        "specimen_imaging_preparation_protocol_uuid",
        "bio_sample_uuid",
    )

    # Perform every lookup before measuring any of them, so all three
    # attributes are always fetched regardless of the others' contents.
    linked_values = [
        get_value_from_attribute_list(dataset.attribute, required_name)
        for required_name in required_attribute_names
    ]
    linked_counts = [len(linked_value) for linked_value in linked_values]

    return all(linked_counts)


def select_indicies(n_indicies: int, n_to_select: int = 5) -> list[int]:
"""Select a number of indicies from input list
Expand Down Expand Up @@ -63,10 +104,12 @@ def sizeof_fmt(num, suffix="B"):
return f"{num:.1f}Yi{suffix}"


def get_convertible_file_references(accession_id: str) -> List[Dict]:
def get_convertible_file_references(
accession_id: str, check_image_creation_prerequisites: bool = True
) -> List[Dict]:
"""Get details of convertible images for given accession ID"""

# ToDo: Fix this to recursively call using until all data returned
# TODO: Fix this to call the API repeatedly until all data is returned
PAGE_SIZE_DEFAULT = 10000000

study = api_client.search_study_by_accession(accession_id)
Expand All @@ -75,26 +118,36 @@ def get_convertible_file_references(accession_id: str) -> List[Dict]:
datasets = api_client.get_dataset_linking_study(
study.uuid, page_size=PAGE_SIZE_DEFAULT
)

file_references = []
convertible_file_references = []

for dataset in datasets:
if check_image_creation_prerequisites:
if not dataset_has_image_creation_prerequisites(dataset):
continue

file_references.extend(
api_client.get_file_reference_linking_dataset(
dataset.uuid, PAGE_SIZE_DEFAULT
)
)

convertible_file_references = [
{
"accession_id": accession_id,
"study_uuid": study.uuid,
"name": fr.file_path,
"uuid": fr.uuid,
"size_in_bytes": fr.size_in_bytes,
"size_human_readable": sizeof_fmt(fr.size_in_bytes),
}
for fr in file_references
if in_bioformats_single_file_formats_list(fr.file_path)
]
convertible_file_references.extend(
[
{
"accession_id": accession_id,
"study_uuid": study.uuid,
"dataset_uuid": dataset.uuid,
"name": fr.file_path,
"uuid": fr.uuid,
"size_in_bytes": fr.size_in_bytes,
"size_human_readable": sizeof_fmt(fr.size_in_bytes),
}
for fr in file_references
if in_bioformats_single_file_formats_list(fr.file_path)
]
)

convertible_file_references = sorted(
convertible_file_references,
Expand All @@ -109,12 +162,15 @@ def write_convertible_file_references_for_accession_id(
output_path: Path,
max_items: int = 5,
append: bool = True,
check_image_creation_prerequisites: bool = True,
) -> int:
"""
Write details of file references proposed for conversion to file
"""

convertible_file_references = get_convertible_file_references(accession_id)
convertible_file_references = get_convertible_file_references(
accession_id, check_image_creation_prerequisites
)

n_proposal_candidates = len(convertible_file_references)
indicies_to_select = select_indicies(n_proposal_candidates, max_items)
Expand All @@ -129,6 +185,7 @@ def write_convertible_file_references_for_accession_id(
[
convertible_file_references[i]["accession_id"],
f"{convertible_file_references[i]['study_uuid']}",
f"{convertible_file_references[i]['dataset_uuid']}",
convertible_file_references[i]["name"],
f"{convertible_file_references[i]['uuid']}",
f"{convertible_file_references[i]['size_in_bytes']}",
Expand All @@ -145,6 +202,7 @@ def write_convertible_file_references_for_accession_id(
[
"accession_id",
"study_uuid",
"dataset_uuid",
"name",
"file_reference_uuid",
"size_in_bytes",
Expand All @@ -162,23 +220,17 @@ def write_convertible_file_references_for_accession_id(

def read_proposals(proposal_path: Path) -> List[Dict]:
    """Read proposals from a tab-separated file.

    The first line of the file is treated as the header and supplies the
    keys of each returned dict (e.g. ``accession_id``, ``file_reference_uuid``,
    ``size_in_bytes``), so the result follows whatever columns the file has.

    Args:
        proposal_path: Path to the tab-separated proposals file.

    Returns:
        One dict per data row. All values are strings as read from the file,
        except ``size_in_bytes`` which is converted to ``int``.

    Raises:
        KeyError: If the header has no ``accession_id`` or ``size_in_bytes``
            column.
        ValueError: If a row's ``size_in_bytes`` is not an integer.
    """

    proposals = []
    # newline="" lets the csv module handle line endings itself.
    with proposal_path.open("r", newline="") as f:
        reader = csv.DictReader(f, delimiter="\t")  # Uses first line as field names
        for row in reader:
            # DictReader drops completely empty lines on its own, but a line
            # holding only whitespace still arrives as a row; skip those too
            # instead of failing on the missing size column.
            if not (row["accession_id"] or "").strip():
                continue
            row["size_in_bytes"] = int(row["size_in_bytes"])  # Convert size to int
            proposals.append(row)

    return proposals
1 change: 1 addition & 0 deletions bia-converter/scripts/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
set_local_env.sh
5 changes: 2 additions & 3 deletions bia-converter/scripts/assign_and_convert_images.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,8 @@ fi

# Put these in a set_environment.sh file and source
bia_assign_image_dir=../../bia-assign-image
bia_converter_light_dir=../../bia-converter-light
bia_converter_dir=../

update_example_image_uri_script_path=$(realpath update_example_image_uri_for_dataset.py)
# Create proposals if the location of a proposals file was not specified
if [ -z "$propose_images_output" ]; then
propose_images_output="$artefact_dir_base/propose_$accession_id.tsv"
Expand Down Expand Up @@ -62,7 +61,7 @@ do
eval $command

static_display_uuid=$(grep -oP 'Created STATIC_DISPLAY image representation with uuid: \K[0-9a-fA-F-]+' $convert_to_static_display_output)
command="poetry --directory $bia_converter_dir run python scripts/update_example_image_uri_for_dataset.py --update-mode replace $static_display_uuid"
command="poetry --directory $bia_converter_dir run python $update_example_image_uri_script_path --update-mode replace $static_display_uuid"
echo $command
eval $command
fi
Expand Down

0 comments on commit ed31c18

Please sign in to comment.