Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ingest settings #309

Merged
merged 7 commits into from
Feb 20, 2025
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/core.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ jobs:
os: [ubuntu-20.04, macos-latest, windows-latest]
project:
[
bia-ingest,
bia-assign-image,
bia-shared-datamodels,
core,
Expand Down Expand Up @@ -50,7 +49,8 @@ jobs:
os: [ubuntu-20.04]
project:
[
bia-export,
bia-export,
bia-ingest,
]
runs-on: ${{ matrix.os }}
defaults:
Expand Down
13 changes: 9 additions & 4 deletions bia-ingest/.env_template
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
bia_data_dir=Root of directory to save/read models from disk if using persistence_mode=disk
bia_api_basepath=CREDENTIALS_FOR_LOCALHOST
bia_api_username=CREDENTIALS_FOR_LOCALHOST
bia_api_password=CREDENTIALS_FOR_LOCALHOST
# bia_data_dir=Root of directory to save/read models from disk if using persistence_mode=disk
local_bia_api_basepath="http://localhost:8080"
local_bia_api_username="[email protected]"
local_bia_api_password="test"
# Do NOT edit this file directly to put in your credentials, as we do not want that information to be committed to git
# Instead, create a copy called .env and update it with your information if you want to use the actual api
bia_api_basepath="https://wwwdev.ebi.ac.uk/bioimage-archive/api"
bia_api_username=YOUR_CREDENTIALS_FOR_WWWDEV_API
bia_api_password=YOUR_CREDENTIALS_FOR_WWWDEV_API
11 changes: 10 additions & 1 deletion bia-ingest/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,16 @@ This creates the following structure (using S-BIAD325 as an example):
...
```

To ingest into the api, set the persistence-mode to `api`:
To ingest into your local api, first set up the local api with
```sh
$ docker compose up --wait --build -d
```
and run ingest with the persistence-mode set to `local_api`:
```sh
$ poetry run biaingest ingest --persistence-mode local_api S-BIAD1285
```

To ingest to the wwwdev api (the main one currently in use), you need to have a user account on the dev API. You can then copy the .env_template file to .env, and fill in the details (email and password) of your user account. Then, to ingest, you can run:
```sh
$ poetry run biaingest ingest --persistence-mode api S-BIAD1285
```
Expand Down
44 changes: 44 additions & 0 deletions bia-ingest/bia_ingest/api_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from bia_ingest.settings import Settings
from bia_integrator_api.util import get_client_private
from bia_integrator_api import Configuration, ApiClient, exceptions
from bia_integrator_api.api import PrivateApi
import bia_integrator_api.models as api_models
import logging

# Settings are loaded once, when this module is first imported.
# NOTE(review): bia_ingest/settings.py already creates a shared module-level
# `settings` instance that other modules import; consider reusing it here
# instead of building a second one — TODO confirm.
settings = Settings()

# The logger name is prefixed with "__main__." so that it is a child of the
# "__main__" logger in the dotted logging hierarchy.
logger = logging.getLogger("__main__." + __name__)


def get_bia_api_client():
    """Return a private API client built from the `bia_api_*` settings.

    The username, password and base URL are read from the module-level
    `settings` object and handed to `get_client_private`.
    """
    connection_kwargs = {
        "username": settings.bia_api_username,
        "password": settings.bia_api_password,
        "api_base_url": settings.bia_api_basepath,
    }
    return get_client_private(**connection_kwargs)


def get_local_bia_api_client():
    """Return a `PrivateApi` client authenticated against the local API.

    Logs in with the `local_bia_api_username` / `local_bia_api_password`
    settings. If that login is rejected with `UnauthorizedException`, the user
    is registered (using `local_user_create_secret_token`) and the login is
    attempted once more.

    Returns:
        The `PrivateApi` instance whose configuration carries the access token.

    Raises:
        RuntimeError: if the login call returns no access token.
    """
    api_config = Configuration(host=settings.local_bia_api_basepath)
    private_api = PrivateApi(ApiClient(configuration=api_config))

    def _login():
        # Single place for the login call so the first attempt and the
        # post-registration retry cannot drift apart.
        return private_api.login_for_access_token(
            username=settings.local_bia_api_username,
            password=settings.local_bia_api_password,
        )

    try:
        access_token = _login()
    except exceptions.UnauthorizedException:
        # The login was rejected: register the user, then retry once.
        private_api.register_user(
            api_models.BodyRegisterUser(
                email=settings.local_bia_api_username,
                password_plain=settings.local_bia_api_password,
                secret_token=settings.local_user_create_secret_token,
            )
        )
        access_token = _login()

    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if not access_token:
        raise RuntimeError("Login to the local BIA API returned no access token")

    api_config.access_token = access_token.access_token
    return private_api
36 changes: 34 additions & 2 deletions bia-ingest/bia_ingest/biostudies/find_bia_studies.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
from datetime import date
import logging
from typing import Optional
from bia_integrator_api.util import get_client
from bia_integrator_api.models import Study
from bia_ingest.settings import settings
import re

logger = logging.getLogger("__main__." + __name__)

Expand Down Expand Up @@ -77,17 +81,44 @@ def get_processed_studies() -> list[str]:
return [acc_id.strip("\n") for acc_id in acc_ids]


def fetch_studies_from_api(
    api_client, page_size: int, agregator_list: "Optional[list[Study]]" = None
) -> "list[Study]":
    """Fetch every study from the API, one page at a time.

    Pages are requested with `api_client.search_study`, each one starting from
    the uuid of the last study collected so far. Fetching stops at the first
    page that is empty or whose length differs from `page_size`.

    The loop is iterative, so the number of pages is not limited by the
    interpreter's recursion depth, and an empty page always terminates it.

    Args:
        api_client: object exposing `search_study(page_size=..., start_from_uuid=...)`.
        page_size: number of studies requested per call.
        agregator_list: studies collected so far. When non-empty, fetching
            resumes after its last element and the list is extended in place.

    Returns:
        The list of all collected studies.
    """
    if not agregator_list:
        agregator_list = []

    while True:
        # Resume after the last study already collected; None means "from the start".
        start_uuid = agregator_list[-1].uuid if agregator_list else None
        fetched_studies = api_client.search_study(
            page_size=page_size, start_from_uuid=start_uuid
        )
        agregator_list += fetched_studies

        # A short page is the last one; the empty check guards against
        # looping forever when nothing more can be fetched.
        if not fetched_studies or len(fetched_studies) != page_size:
            return agregator_list


def find_unprocessed_studies(output_file: Optional[pathlib.Path]):
def get_accno(acc_id):
match = re.search(r"\d+$", acc_id)
return int(match.group()) if match else None

page_size = 100
logging.info("Fetching all studies from biostudies")
imaging_studies = get_all_bia_studies(page_size)
grouped_studies = studies_by_source(imaging_studies)
studies_of_interest = (
grouped_studies["BIAD"] + grouped_studies["BSST"] + grouped_studies["other"]
)
acc_id_of_interest = [result.accession for result in studies_of_interest]
processed_acc_ids = get_processed_studies()
unprocessed_acc_ids = set(acc_id_of_interest) - set(processed_acc_ids)
logging.info("Fetching all studies from bia api")
api_client = get_client(settings.bia_api_basepath)
bia_existing_studies = fetch_studies_from_api(api_client, page_size)
processed_acc_ids = [str(study.accession_id) for study in bia_existing_studies]
unprocessed_acc_ids = sorted(list(set(acc_id_of_interest) - set(processed_acc_ids)), key=lambda acc_id : get_accno(acc_id))

if not output_file:
output_file = (
Expand All @@ -96,6 +127,7 @@ def find_unprocessed_studies(output_file: Optional[pathlib.Path]):
/ f"uningested_studies_of_interest_{str(date.today())}"
)

logging.info(f"Writing uningested studies to: {output_file}")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo?

with open(output_file, "w") as f:
for id in unprocessed_acc_ids:
f.write(f"{id}\n")
25 changes: 1 addition & 24 deletions bia-ingest/bia_ingest/biostudies/generic_conversion_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import logging
from pathlib import Path
from typing import List, Any, Dict, Optional, Tuple, Union
from pydantic import BaseModel, ValidationError
from pydantic.alias_generators import to_snake
from bia_ingest.biostudies.submission_parsing_utils import (
find_sections_recursive,
mattributes_to_dict,
Expand All @@ -13,10 +11,8 @@
Submission,
Section,
)
from ..config import settings, api_client
from ..cli_logging import IngestionResult, log_failed_model_creation
import bia_integrator_api.models as api_models
from bia_shared_datamodels import attribute_models


logger = logging.getLogger("__main__." + __name__)

Expand Down Expand Up @@ -88,25 +84,6 @@ def get_generic_section_as_dict(
return return_dict


def persist(object_list: List[BaseModel], object_path: str, sumbission_accno: str):
    """Persist each object either to the API or to JSON files on disk.

    When `object_path` is "api", every object is re-validated as the API model
    named by `obj.model.type_name` and sent through the matching
    `post_<snake_case_type_name>` method of `api_client`. For any other value,
    objects are written as `<uuid>.json` under
    `<settings.bia_data_dir>/<object_path>/<sumbission_accno>`, creating that
    directory first if it does not exist.
    """
    if object_path != "api":
        target_dir = Path(settings.bia_data_dir) / object_path / sumbission_accno
        if not target_dir.is_dir():
            target_dir.mkdir(parents=True)
            logger.debug(f"Created {target_dir}")
        for item in object_list:
            json_file = target_dir / f"{item.uuid}.json"
            json_file.write_text(item.model_dump_json(indent=2))
            logger.debug(f"Written {json_file}")
        return

    for item in object_list:
        post_method_name = f"post_{to_snake(item.model.type_name)}"
        api_model_cls = getattr(api_models, item.model.type_name)
        api_item = api_model_cls.model_validate_json(item.model_dump_json())
        post_method = getattr(api_client, post_method_name)
        post_method(api_item)
        logger.info(f"persisted {item.uuid} to API")


def object_value_pair_to_dict(
objects: List[Any], key_attr: str, value_attr: Optional[str]
Expand Down
17 changes: 7 additions & 10 deletions bia-ingest/bia_ingest/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
Submission,
)
from bia_ingest.biostudies.generic_conversion_utils import attributes_to_dict
from bia_ingest.config import settings, api_client
#from bia_ingest.config import settings, api_client
#from bia_ingest.settings import settings

from bia_ingest.persistence_strategy import (
PersistenceMode,
persistence_strategy_factory,
Expand Down Expand Up @@ -63,13 +65,13 @@ def find_new_studies(
@app.command(help="Ingest from biostudies and echo json of bia_data_model.Study")
def ingest(
accession_id_list: Annotated[Optional[List[str]], typer.Argument()] = None,
input_file: Annotated[Optional[Path], typer.Option("--input_file", "-f")] = None,
input_file: Annotated[Optional[Path], typer.Option("--input-file", "-f")] = None,
persistence_mode: Annotated[
PersistenceMode, typer.Option(case_sensitive=False)
PersistenceMode, typer.Option("--persistence-mode", "-pm", case_sensitive=False)
] = PersistenceMode.disk,
verbose: Annotated[bool, typer.Option("--verbose", "-v")] = False,
process_filelist: Annotated[
ProcessFilelistMode, typer.Option(case_sensitive=False)
ProcessFilelistMode, typer.Option("--process-filelist", "-pf", case_sensitive=False)
] = ProcessFilelistMode.ask,
dryrun: Annotated[bool, typer.Option()] = False,
write_csv: Annotated[str, typer.Option()] = None,
Expand All @@ -91,12 +93,7 @@ def ingest(

persister = None
if not dryrun:
persister = persistence_strategy_factory(
persistence_mode,
output_dir_base=settings.bia_data_dir,
accession_id=accession_id,
api_client=api_client,
)
persister = persistence_strategy_factory(persistence_mode, accession_id=accession_id)

try:
# Get information from biostudies
Expand Down
57 changes: 0 additions & 57 deletions bia-ingest/bia_ingest/config.py

This file was deleted.

9 changes: 7 additions & 2 deletions bia-ingest/bia_ingest/persistence_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from bia_integrator_api.api.public_api import PublicApi
from bia_integrator_api.exceptions import NotFoundException
import bia_integrator_api.models as api_models
from bia_ingest.api_client import get_bia_api_client, get_local_bia_api_client
from bia_ingest.settings import settings

logger = logging.getLogger("__main__." + __name__)

Expand All @@ -19,6 +21,7 @@ class PersistenceMode(str, Enum):
"""Destinations for persistence"""

api = "api"
local_api = "local_api"
disk = "disk"


Expand Down Expand Up @@ -137,10 +140,12 @@ def persistence_strategy_factory(persistence_mode: PersistenceMode, **kwargs):
persistence_mode = PersistenceMode(persistence_mode)

if persistence_mode == PersistenceMode.api:
return ApiPersister(api_client=kwargs["api_client"])
return ApiPersister(api_client=get_bia_api_client())
elif persistence_mode == PersistenceMode.local_api:
return ApiPersister(api_client=get_local_bia_api_client())
elif persistence_mode == PersistenceMode.disk:
return DiskPersister(
output_dir_base=kwargs["output_dir_base"],
output_dir_base=settings.bia_data_dir,
accession_id=kwargs["accession_id"],
)
else:
Expand Down
41 changes: 41 additions & 0 deletions bia-ingest/bia_ingest/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from pathlib import Path
import os
import logging

from pydantic import Field, AliasChoices
from pydantic_settings import BaseSettings, SettingsConfigDict

# Module logger; the "__main__." prefix makes it a child of the "__main__"
# logger in the dotted logging hierarchy.
logger = logging.getLogger("__main__." + __name__)


def _default_bia_data_dir() -> str:
    """Return the default directory used to save/read models on disk.

    Uses $HOME when it is set (unchanged behaviour). Otherwise falls back to
    `Path.home()`, which also resolves the home directory where HOME is not
    defined, instead of silently producing a path relative to the current
    working directory.
    """
    home = os.environ.get("HOME")
    if not home:
        try:
            home = Path.home()
        except RuntimeError:
            # Home directory cannot be determined: keep the old relative-path fallback.
            home = ""
    return str(Path(home) / ".cache" / "bia-integrator-data-sm")


class Settings(BaseSettings):
    """Configuration for bia-ingest, read from the environment and env files."""

    # Note env files overwrite one another in order of the list (last element overwrites previous ones)
    # Uses api settings to get user create token when testing locally.
    model_config = SettingsConfigDict(
        env_file=[
            str(Path(__file__).parents[2] / "api" / ".env_compose"),
            str(Path(__file__).parents[1] / ".env_template"),
            str(Path(__file__).parents[1] / ".env"),
        ],
        env_file_encoding="utf-8",
        case_sensitive=False,
        # Keys in the env files that are not declared below are ignored.
        extra="ignore",
    )

    # Root directory used when persisting models to disk.
    bia_data_dir: str = Field(_default_bia_data_dir())

    # Connection details for the local API.
    local_bia_api_basepath: str = Field("http://localhost:8080")
    local_bia_api_username: str = Field("[email protected]")
    local_bia_api_password: str = Field("test")
    # No default is given; the value is read from either of the two names below.
    local_user_create_secret_token: str = Field(
        validation_alias=AliasChoices(
            "local_user_create_secret_token", "USER_CREATE_SECRET_TOKEN"
        )
    )

    # Connection details for the non-local API; empty unless configured.
    bia_api_basepath: str = Field("")
    bia_api_username: str = Field("")
    bia_api_password: str = Field("")


# Shared settings instance, created when this module is first imported; other
# modules obtain it with `from bia_ingest.settings import settings`.
settings = Settings()
Loading
Loading