Export update command (#308)
* remove unnecessary caching mechanism

* added export options

* added logic for update options and expanded study ordering testing

* updated comments

* nested function to make tests easier to read

* added comments to tests

* removed cache data

* updated readme

* remove old incorrect readme info

* add info about docker installation

* fix default pathing when running the website all command
sherwoodf authored Feb 17, 2025
1 parent cc0c5c9 commit 889a0a7
Showing 8 changed files with 312 additions and 4,630 deletions.
23 changes: 12 additions & 11 deletions bia-export/README.md
@@ -18,9 +18,8 @@ To test that installation has worked correctly, you can run:

poetry run bia-export website all S-BIADTEST -r test/input_data

With docker daemon/desktop running, in the root of this package (bia-integrator/) run:
To run all the tests, docker needs to be installed.

make bia-export.test

Usage
-----
@@ -34,6 +33,16 @@ Run:

This will create 3 files (bia-study-metadata.json, bia-image-metadata.json, bia-dataset-metadata-for-images.json), which can replace the files of the same name in the data directory of the astro package to generate new study pages.
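
Each of these files is a single json object keyed by ID; for example, bia-study-metadata.json maps accession IDs to the exported study metadata, roughly:

{
    "S-BIADTEST": { "accession_id": "S-BIADTEST", ... }
}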


For all website commands, there is the option to produce updated jsons using the -u option, which points either to a directory containing all 3 jsons (for the website all command) or to the specific file for the more specific commands below. E.g.

poetry run bia-export website all S-BIAD830 -u ./path/to/existing/astro/jsons

will create updated jsons from all the jsons stored in that folder, using the information for S-BIAD830 from the API (adding or replacing existing information about that study).

Note that setting the -o (output) and -u (files to update) paths to be different is recommended to avoid directly writing over files, though overwrite behaviour does work.
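
For example, to read the existing jsons from one directory and write the updated ones to another (paths here are illustrative):

poetry run bia-export website all S-BIAD830 -o ./updated/jsons -u ./path/to/existing/astro/jsons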

-----
### Study export for website

Used to create jsons which start at study objects and follow paths to all related objects that we want to display on a single study page of the website.
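
A typical invocation (accession ID and output filename here are illustrative):

poetry run bia-export website study S-BIAD830 -o bia-study-metadata.json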
@@ -48,14 +57,6 @@ Note that with -r (root directory) - local files will be used to generate the ex

If no Accession ID or UUID is passed, all studies will be processed (either based on all studies in the <root-folder>/study/ directory, or by querying for studies in the api). The studies are exported in order of release date (newest first, with the numeric part of the accession ID breaking ties).

The two points above hold for all export commands for the api. For the website-study export only, there is an optional cache to avoid processing all file references every time an export is performed (as this slows down export a lot). E.g. running:


poetry run bia-export website study -o bia-study-metadata.json -c read_cache


Will export all studies using the cached aggregation (when available) for the counts of images, files, and the list of different file types.

----

### Image export for website
@@ -86,7 +87,7 @@ Running tests

Requires a locally running api in a docker container. This can be started by running:

docker compose up --build --force-recreate --remove-orphans -d
docker compose up --build --force-recreate --remove-orphans -d --wait

and then running tests, e.g.:
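
poetry run pytest

(pytest is assumed here as the test runner)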

162 changes: 142 additions & 20 deletions bia-export/bia_export/cli.py
@@ -4,7 +4,7 @@
from rich.logging import RichHandler
from typing_extensions import Annotated
from pathlib import Path
from bia_export.website_export.export_all import get_study_ids
from bia_export.website_export.export_all import get_study_ids, study_sort_key
from .website_export.studies.transform import transform_study
from .website_export.studies.models import StudyCLIContext, CacheUse
from .website_export.images.transform import transform_images
@@ -16,6 +16,7 @@
from .bia_client import api_client
import json
from .settings import Settings
import os

logging.basicConfig(
level="NOTSET", format="%(message)s", datefmt="[%X]", handlers=[RichHandler()]
@@ -45,32 +46,74 @@ def generate_all(
help="If root directory specified then use files there, rather than calling API",
),
] = None,
cache: Annotated[
Optional[CacheUse],
typer.Option(
"--cache",
"-c",
),
] = None,
output_directory: Annotated[
Optional[Path],
typer.Option(
"--out_dir",
"-o",
),
] = None,
update_path: Annotated[
Optional[Path],
typer.Option(
"--update_path",
"-u",
help="If update path specified then update the files there (assuming files have the default naming)",
),
] = None,
):
validate_cli_inputs(id_list=id_list, update_path=update_path)

settings = Settings()

if not id_list:
id_list = get_study_ids(root_directory)

logger.info("Exporting study pages")
website_study(id_list=id_list, root_directory=root_directory, cache=cache, output_filename=(output_directory / DEFAULT_WEBSITE_STUDY_FILE_NAME if output_directory else None))
website_study(
id_list=id_list,
root_directory=root_directory,
output_filename=(
output_directory / DEFAULT_WEBSITE_STUDY_FILE_NAME
if output_directory
else Path(DEFAULT_WEBSITE_STUDY_FILE_NAME)
),
update_file=(
update_path / DEFAULT_WEBSITE_STUDY_FILE_NAME
if update_path
else None
)
)
logger.info("Exporting image pages")
website_image(id_list=id_list, root_directory=root_directory, output_filename=(output_directory / DEFAULT_WEBSITE_IMAGE_FILE_NAME if output_directory else None))
website_image(
id_list=id_list,
root_directory=root_directory,
output_filename=(
output_directory / DEFAULT_WEBSITE_IMAGE_FILE_NAME
if output_directory
else Path(DEFAULT_WEBSITE_IMAGE_FILE_NAME)
),
update_file=(
update_path / DEFAULT_WEBSITE_IMAGE_FILE_NAME
if update_path
else None
)
)
logger.info("Exporting datasets for study pages")
datasets_for_website_image(id_list=id_list, root_directory=root_directory, output_filename=(output_directory / DEFAULT_WEBSITE_DATASET_FOR_IMAGE_FILE_NAME if output_directory else None))
datasets_for_website_image(
id_list=id_list,
root_directory=root_directory,
output_filename=(
output_directory / DEFAULT_WEBSITE_DATASET_FOR_IMAGE_FILE_NAME
if output_directory
else Path(DEFAULT_WEBSITE_DATASET_FOR_IMAGE_FILE_NAME)
),
update_file=(
update_path / DEFAULT_WEBSITE_DATASET_FOR_IMAGE_FILE_NAME
if update_path
else None
)
)


@website.command("study")
@@ -93,29 +136,41 @@ def website_study(
help="If root directory specified then use files there, rather than calling API",
),
] = None,
cache: Annotated[
Optional[CacheUse],
update_file: Annotated[
Optional[Path],
typer.Option(
"--cache",
"-c",
"--update_file",
"-u",
help="If update file specified then update the file with studies provided.",
),
] = None,
):
settings = Settings()

validate_cli_inputs(id_list=id_list, update_file=update_file)

settings = Settings()

if not id_list:
id_list = get_study_ids(root_directory)

studies_map = {}

if update_file:
studies_map |= file_data_to_update(update_file)

for id in id_list:
context = create_cli_context(StudyCLIContext, id, root_directory, cache)
context = create_cli_context(StudyCLIContext, id, root_directory)
study = transform_study(context)
studies_map[study.accession_id] = study.model_dump(mode="json")

if id_list:
sorted_map = dict(sorted(studies_map.items(), key=lambda item_tuple: study_sort_key(item_tuple[1]), reverse=True))
else:
sorted_map = studies_map

logging.info(f"Writing study info to {output_filename.absolute()}")
with open(output_filename, "w") as output:
output.write(json.dumps(studies_map, indent=4))
output.write(json.dumps(sorted_map, indent=4))


@website.command("image")
@@ -138,14 +193,29 @@ def website_image(
help="If root directory specified then use files there, rather than calling API",
),
] = None,
update_file: Annotated[
Optional[Path],
typer.Option(
"--update_file",
"-u",
help="If update file specified then update the file with studies provided.",
),
] = None,
):
validate_cli_inputs(id_list=id_list, update_file=update_file)

settings = Settings()

if not id_list:
id_list = get_study_ids(root_directory)

image_map = {}
if update_file:
image_map |= file_data_to_update(update_file)

for id in id_list:
context = create_cli_context(ImageCLIContext, id, root_directory)
image_map = image_map | transform_images(context)
image_map |= transform_images(context)

logging.info(f"Writing website images to {output_filename.absolute()}")
with open(output_filename, "w") as output:
Expand All @@ -172,18 +242,33 @@ def datasets_for_website_image(
help="If root directory specified then use files there, rather than calling API",
),
] = None,
update_file: Annotated[
Optional[Path],
typer.Option(
"--update_file",
"-u",
help="If update file specified then update the file with studies provided.",
),
] = None,
):

validate_cli_inputs(id_list=id_list, update_file=update_file)

settings = Settings()

if not id_list:
id_list = get_study_ids(root_directory)

dataset_map = {}
if update_file:
dataset_map |= file_data_to_update(update_file)

for id in id_list:
context = create_cli_context(CLIContext, id, root_directory)
dataset_map = dataset_map | transform_datasets(context)
dataset_map |= transform_datasets(context)

logging.info(f"Writing datasets for images to {output_filename.absolute()}")

with open(output_filename, "w") as output:
output.write(json.dumps(dataset_map, indent=4))

@@ -229,5 +314,42 @@ def get_uuid_from_accession_id(accession_id: str) -> str:
)


def validate_cli_inputs(
id_list: Optional[List[str]] = None,
update_file: Optional[Path] = None,
update_path: Optional[Path] = None,
output_file: Optional[Path] = None,
output_path: Optional[Path] = None,
):

if (update_path or update_file) and not id_list:
raise ValueError(
"Study IDs must be specified if export website commands are being used in update mode"
)

if update_path:
if not os.path.isdir(update_path):
raise NotADirectoryError(
f"Update path: {update_path}, needs to be the directory containing expected files to update."
)

if update_file and not os.path.isfile(update_file):
raise FileNotFoundError(
f"Update path: {update_path}, needs to be the file to update."
)

    if output_file and update_file and (output_file.resolve() == update_file.resolve()):
        logger.warning("Output file and update file are the same; the update file will be overwritten in place.")

    if output_path and update_path and (output_path.resolve() == update_path.resolve()):
        logger.warning("Output directory and update directory are the same; existing files will be overwritten in place.")


def file_data_to_update(file_path: Path) -> dict:
    with open(file_path, "r") as f:
        data: dict = json.load(f)
    return data

if __name__ == "__main__":
app()
49 changes: 29 additions & 20 deletions bia-export/bia_export/website_export/export_all.py
@@ -1,27 +1,28 @@
from glob import glob
from pathlib import Path
from typing import Optional
from typing import Optional, Union
from bia_export.bia_client import api_client
from bia_integrator_api.models import Study
from bia_export.website_export.studies.models import Study as exportStudy
from bia_integrator_api.models import Study as apiStudy
from .generic_object_retrieval import read_api_json_file
import logging
import re

logger = logging.getLogger("__main__." + __name__)


def find_local_studies(root_path: Path) -> list[Study]:
def find_local_studies(root_path: Path) -> list[apiStudy]:
study_search_path = root_path.joinpath("study", "**/*.json")
file_paths = glob(str(study_search_path), recursive=True)
studies = []
for file_path in file_paths:
studies.append(read_api_json_file(file_path, Study))
studies.append(read_api_json_file(file_path, apiStudy))
return studies


def fetch_studies_from_api(
page_size: int, agregator_list: list[Study] = None
) -> list[Study]:
page_size: int, agregator_list: list[apiStudy] = None
) -> list[apiStudy]:
if not agregator_list:
agregator_list = []
start_uuid = None
@@ -40,23 +41,31 @@ def fetch_studies_from_api(


def get_study_ids(root_directory: Optional[Path] = None):
def get_accno(acc_id):
match = re.search(r"\d+$", acc_id)
return int(match.group()) if match else None

if root_directory:
studies_list = find_local_studies(root_directory)
sorted_studies = sorted(
studies_list,
key=lambda study: (study.release_date, get_accno(study.accession_id)),
reverse=True,
)
sorted_studies = sort_studies(studies_list)
return [study.accession_id for study in sorted_studies]
else:
studies_list = fetch_studies_from_api(page_size=100)
sorted_studies = sorted(
studies_list,
key=lambda study: (study.release_date, get_accno(study.accession_id)),
reverse=True,
)
sorted_studies = sort_studies(studies_list)
return [study.uuid for study in sorted_studies]


def study_sort_key(study: Union[apiStudy, exportStudy, dict]) -> tuple[str, int]:
def get_accno(acc_id):
match = re.search(r"\d+$", acc_id)
return int(match.group()) if match else None

if isinstance(study, (apiStudy, exportStudy)):
study = study.model_dump()

return (study["release_date"], get_accno(study["accession_id"]))

def sort_studies(studies_list: list[Union[apiStudy, exportStudy]]):
sorted_studies = sorted(
studies_list,
key=lambda study: study_sort_key(study),
reverse=True,
)

return sorted_studies
