Export update command (#308)
* remove unnecessary caching mechanism

* added export options

* added logic for update options and expanded study ordering testing

* updated comments

* nested function to make tests easier to read

* added comments to tests

* removed cache data

* updated readme

* remove old incorrect readme info

* add info about docker installation

* fix default pathing when running the website all command
sherwoodf authored Feb 17, 2025
1 parent cc0c5c9 commit 889a0a7
Showing 8 changed files with 312 additions and 4,630 deletions.
23 changes: 12 additions & 11 deletions bia-export/README.md
@@ -18,9 +18,8 @@ To test that installation has worked correctly, you can run:

poetry run bia-export website all S-BIADTEST -r test/input_data

With docker daemon/desktop running, in the root of this package (bia-integrator/) run:
To run all the tests, docker needs to be installed.

make bia-export.test

Usage
-----
@@ -34,6 +33,16 @@ Run:

This will create 3 files (bia-study-metadata.json, bia-image-metadata.json, bia-dataset-metadata-for-images.json), which can replace the files of the same name in the data directory of the astro package to generate new study pages.
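
Each of these files is a single json object keyed by ID; for example, bia-study-metadata.json maps accession IDs to the exported study metadata, roughly:

{
    "S-BIADTEST": { "accession_id": "S-BIADTEST", ... }
}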


For all website commands, there is the option to produce updated jsons using the -u option, which points either to a directory containing all 3 jsons (for the website all command) or to the specific file for the more specific commands below. E.g.

poetry run bia-export website all S-BIAD830 -u ./path/to/existing/astro/jsons

will create updated jsons from all the jsons stored in that folder, using the information for S-BIAD830 from the API (adding or replacing existing information about that study).

Note that setting the -o (output) and -u (files to update) paths to be different is recommended to avoid directly writing over files, though overwrite behaviour does work.
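
For example, to read the existing jsons from one directory and write the updated ones to another (paths here are illustrative):

poetry run bia-export website all S-BIAD830 -o ./updated/jsons -u ./path/to/existing/astro/jsons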

-----
### Study export for website

Used to create jsons which start at study objects and follow paths to all related objects that we want to display on a single study page of the website.
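
A typical invocation (accession ID and output filename here are illustrative):

poetry run bia-export website study S-BIAD830 -o bia-study-metadata.json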
@@ -48,14 +57,6 @@ Note that with -r (root directory) - local files will be used to generate the ex

If no Accession ID or UUID is passed, all studies will be processed (either based on all studies in the <root-folder>/study/ directory, or by querying for studies in the api). The studies are exported in order of release date (newest first, with the numeric part of the accession ID breaking ties).

The two points above hold for all export commands for the api. For the website-study export only, there is an optional cache to avoid processing all file references every time an export is performed (as this slows down export a lot). E.g. running:


poetry run bia-export website study -o bia-study-metadata.json -c read_cache


Will export all studies using the cached aggregation (when available) for the counts of images, files, and the list of different file types.

----

### Image export for website
@@ -86,7 +87,7 @@ Running tests

Requires a locally running api in a docker container. This can be started by running:

docker compose up --build --force-recreate --remove-orphans -d
docker compose up --build --force-recreate --remove-orphans -d --wait

and then running tests, e.g.:
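
poetry run pytest

(pytest is assumed here as the test runner)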

162 changes: 142 additions & 20 deletions bia-export/bia_export/cli.py
@@ -4,7 +4,7 @@
from rich.logging import RichHandler
from typing_extensions import Annotated
from pathlib import Path
from bia_export.website_export.export_all import get_study_ids
from bia_export.website_export.export_all import get_study_ids, study_sort_key
from .website_export.studies.transform import transform_study
from .website_export.studies.models import StudyCLIContext, CacheUse
from .website_export.images.transform import transform_images
@@ -16,6 +16,7 @@
from .bia_client import api_client
import json
from .settings import Settings
import os

logging.basicConfig(
level="NOTSET", format="%(message)s", datefmt="[%X]", handlers=[RichHandler()]
@@ -45,32 +46,74 @@ def generate_all(
help="If root directory specified then use files there, rather than calling API",
),
] = None,
cache: Annotated[
Optional[CacheUse],
typer.Option(
"--cache",
"-c",
),
] = None,
output_directory: Annotated[
Optional[Path],
typer.Option(
"--out_dir",
"-o",
),
] = None,
update_path: Annotated[
Optional[Path],
typer.Option(
"--update_path",
"-u",
help="If update path specified then update the files there (assuming files have the default naming)",
),
] = None,
):
validate_cli_inputs(id_list=id_list, update_path=update_path)

settings = Settings()

if not id_list:
id_list = get_study_ids(root_directory)

logger.info("Exporting study pages")
website_study(id_list=id_list, root_directory=root_directory, cache=cache, output_filename=(output_directory / DEFAULT_WEBSITE_STUDY_FILE_NAME if output_directory else None))
website_study(
id_list=id_list,
root_directory=root_directory,
output_filename=(
output_directory / DEFAULT_WEBSITE_STUDY_FILE_NAME
if output_directory
else Path(DEFAULT_WEBSITE_STUDY_FILE_NAME)
),
update_file=(
update_path / DEFAULT_WEBSITE_STUDY_FILE_NAME
if update_path
else None
)
)
logger.info("Exporting image pages")
website_image(id_list=id_list, root_directory=root_directory, output_filename=(output_directory / DEFAULT_WEBSITE_IMAGE_FILE_NAME if output_directory else None))
website_image(
id_list=id_list,
root_directory=root_directory,
output_filename=(
output_directory / DEFAULT_WEBSITE_IMAGE_FILE_NAME
if output_directory
else Path(DEFAULT_WEBSITE_IMAGE_FILE_NAME)
),
update_file=(
update_path / DEFAULT_WEBSITE_IMAGE_FILE_NAME
if update_path
else None
)
)
logger.info("Exporting datasets for study pages")
datasets_for_website_image(id_list=id_list, root_directory=root_directory, output_filename=(output_directory / DEFAULT_WEBSITE_DATASET_FOR_IMAGE_FILE_NAME if output_directory else None))
datasets_for_website_image(
id_list=id_list,
root_directory=root_directory,
output_filename=(
output_directory / DEFAULT_WEBSITE_DATASET_FOR_IMAGE_FILE_NAME
if output_directory
else Path(DEFAULT_WEBSITE_DATASET_FOR_IMAGE_FILE_NAME)
),
update_file=(
update_path / DEFAULT_WEBSITE_DATASET_FOR_IMAGE_FILE_NAME
if update_path
else None
)
)


@website.command("study")
@@ -93,29 +136,41 @@ def website_study(
help="If root directory specified then use files there, rather than calling API",
),
] = None,
cache: Annotated[
Optional[CacheUse],
update_file: Annotated[
Optional[Path],
typer.Option(
"--cache",
"-c",
"--update_file",
"-u",
help="If update file specified then update the file with studies provided.",
),
] = None,
):
settings = Settings()

validate_cli_inputs(id_list=id_list, update_file=update_file)

settings = Settings()

if not id_list:
id_list = get_study_ids(root_directory)

studies_map = {}

if update_file:
studies_map |= file_data_to_update(update_file)

for id in id_list:
context = create_cli_context(StudyCLIContext, id, root_directory, cache)
context = create_cli_context(StudyCLIContext, id, root_directory)
study = transform_study(context)
studies_map[study.accession_id] = study.model_dump(mode="json")

if id_list:
sorted_map = dict(sorted(studies_map.items(), key=lambda item_tuple: study_sort_key(item_tuple[1]), reverse=True))
else:
sorted_map = studies_map

logging.info(f"Writing study info to {output_filename.absolute()}")
with open(output_filename, "w") as output:
output.write(json.dumps(studies_map, indent=4))
output.write(json.dumps(sorted_map, indent=4))


@website.command("image")
@@ -138,14 +193,29 @@ def website_image(
help="If root directory specified then use files there, rather than calling API",
),
] = None,
update_file: Annotated[
Optional[Path],
typer.Option(
"--update_file",
"-u",
help="If update file specified then update the file with studies provided.",
),
] = None,
):
validate_cli_inputs(id_list=id_list, update_file=update_file)

settings = Settings()

if not id_list:
id_list = get_study_ids(root_directory)

image_map = {}
if update_file:
image_map |= file_data_to_update(update_file)

for id in id_list:
context = create_cli_context(ImageCLIContext, id, root_directory)
image_map = image_map | transform_images(context)
image_map |= transform_images(context)

logging.info(f"Writing website images to {output_filename.absolute()}")
with open(output_filename, "w") as output:
Expand All @@ -172,18 +242,33 @@ def datasets_for_website_image(
help="If root directory specified then use files there, rather than calling API",
),
] = None,
update_file: Annotated[
Optional[Path],
typer.Option(
"--update_file",
"-u",
help="If update file specified then update the file with studies provided.",
),
] = None,
):

validate_cli_inputs(id_list=id_list, update_file=update_file)

settings = Settings()

if not id_list:
id_list = get_study_ids(root_directory)

dataset_map = {}
if update_file:
dataset_map |= file_data_to_update(update_file)

for id in id_list:
context = create_cli_context(CLIContext, id, root_directory)
dataset_map = dataset_map | transform_datasets(context)
dataset_map |= transform_datasets(context)

logging.info(f"Writing datasets for images to {output_filename.absolute()}")

with open(output_filename, "w") as output:
output.write(json.dumps(dataset_map, indent=4))

@@ -229,5 +314,42 @@ def get_uuid_from_accession_id(accession_id: str) -> str:
)


def validate_cli_inputs(
id_list: Optional[List[str]] = None,
update_file: Optional[Path] = None,
update_path: Optional[Path] = None,
output_file: Optional[Path] = None,
output_path: Optional[Path] = None,
):

if (update_path or update_file) and not id_list:
raise ValueError(
"Study IDs must be specified if export website commands are being used in update mode"
)

if update_path:
if not os.path.isdir(update_path):
raise NotADirectoryError(
f"Update path: {update_path}, needs to be the directory containing expected files to update."
)

if update_file and not os.path.isfile(update_file):
raise FileNotFoundError(
f"Update path: {update_path}, needs to be the file to update."
)

    if output_file and update_file and (output_file.resolve() == update_file.resolve()):
        logger.warning("Output file and update file are the same; the update file will be overwritten in place.")

    if output_path and update_path and (output_path.resolve() == update_path.resolve()):
        logger.warning("Output directory and update directory are the same; existing files will be overwritten in place.")


def file_data_to_update(file_path: Path) -> dict:
    with open(file_path, "r") as f:
        data: dict = json.load(f)
    return data

if __name__ == "__main__":
app()
49 changes: 29 additions & 20 deletions bia-export/bia_export/website_export/export_all.py
@@ -1,27 +1,28 @@
from glob import glob
from pathlib import Path
from typing import Optional
from typing import Optional, Union
from bia_export.bia_client import api_client
from bia_integrator_api.models import Study
from bia_export.website_export.studies.models import Study as exportStudy
from bia_integrator_api.models import Study as apiStudy
from .generic_object_retrieval import read_api_json_file
import logging
import re

logger = logging.getLogger("__main__." + __name__)


def find_local_studies(root_path: Path) -> list[Study]:
def find_local_studies(root_path: Path) -> list[apiStudy]:
study_search_path = root_path.joinpath("study", "**/*.json")
file_paths = glob(str(study_search_path), recursive=True)
studies = []
for file_path in file_paths:
studies.append(read_api_json_file(file_path, Study))
studies.append(read_api_json_file(file_path, apiStudy))
return studies


def fetch_studies_from_api(
page_size: int, agregator_list: list[Study] = None
) -> list[Study]:
page_size: int, agregator_list: list[apiStudy] = None
) -> list[apiStudy]:
if not agregator_list:
agregator_list = []
start_uuid = None
@@ -40,23 +41,31 @@ def fetch_studies_from_api(


def get_study_ids(root_directory: Optional[Path] = None):
def get_accno(acc_id):
match = re.search(r"\d+$", acc_id)
return int(match.group()) if match else None

if root_directory:
studies_list = find_local_studies(root_directory)
sorted_studies = sorted(
studies_list,
key=lambda study: (study.release_date, get_accno(study.accession_id)),
reverse=True,
)
sorted_studies = sort_studies(studies_list)
return [study.accession_id for study in sorted_studies]
else:
studies_list = fetch_studies_from_api(page_size=100)
sorted_studies = sorted(
studies_list,
key=lambda study: (study.release_date, get_accno(study.accession_id)),
reverse=True,
)
sorted_studies = sort_studies(studies_list)
return [study.uuid for study in sorted_studies]


def study_sort_key(study: Union[apiStudy, exportStudy, dict]) -> tuple[str, int]:
def get_accno(acc_id):
match = re.search(r"\d+$", acc_id)
return int(match.group()) if match else None

if isinstance(study, (apiStudy, exportStudy)):
study = study.model_dump()

return (study["release_date"], get_accno(study["accession_id"]))

def sort_studies(studies_list: list[Union[apiStudy, exportStudy]]):
sorted_studies = sorted(
studies_list,
key=lambda study: study_sort_key(study),
reverse=True,
)

return sorted_studies
