From fc2b5ca73363166063680df4ee05c8b4479da0ee Mon Sep 17 00:00:00 2001
From: mvdbeek
Date: Tue, 28 Mar 2023 17:05:08 +0200
Subject: [PATCH 01/31] Add data_manager_mode argument for IDC use

Idea is that we build bundles against the existing servers, then just
pull the bundles into an instance that can write to cvmfs.
---
 src/ephemeris/run_data_managers.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/ephemeris/run_data_managers.py b/src/ephemeris/run_data_managers.py
index c07feb0..5d130f6 100644
--- a/src/ephemeris/run_data_managers.py
+++ b/src/ephemeris/run_data_managers.py
@@ -28,7 +28,9 @@
 import logging
 import time
 from collections import namedtuple
+from typing import Literal
 
+from bioblend.galaxy import GalaxyInstance
 from bioblend.galaxy.tool_data import ToolDataClient
 from bioblend.galaxy.tools import ToolClient
 from jinja2 import Template
@@ -48,6 +50,7 @@
 DEFAULT_URL = "http://localhost"
 DEFAULT_SOURCE_TABLES = ["all_fasta"]
 
+DATA_MANAGER_MODES = Literal["dry_run", "populate", "bundle"]
 
 
 def wait(gi, job_list, log):
@@ -107,7 +110,7 @@ def get_first_valid_entry(input_dict, key_list):
 
 
 class DataManagers:
-    def __init__(self, galaxy_instance, configuration):
+    def __init__(self, galaxy_instance: GalaxyInstance, configuration):
         """
         :param galaxy_instance: A GalaxyInstance object (import from bioblend.galaxy)
         :param configuration: A dictionary. Examples in the ephemeris documentation.
@@ -245,7 +248,7 @@ def parse_items(self, items):
             items = json.loads(rendered_items)
         return items
 
-    def run(self, log=None, ignore_errors=False, overwrite=False):
+    def run(self, log=None, ignore_errors=False, overwrite=False, data_manager_mode: DATA_MANAGER_MODES = "populate"):
         """
         Runs the data managers.
         :param log: The log to be used.
@@ -277,7 +280,7 @@ def run_jobs(jobs, skipped_jobs):
                 all_skipped_jobs.append(skipped_job)
             for job in jobs:
                 started_job = self.tool_client.run_tool(
-                    history_id=None, tool_id=job["tool_id"], tool_inputs=job["inputs"]
+                    history_id=None, tool_id=job["tool_id"], tool_inputs=job["inputs"], data_manager_mode=data_manager_mode
                 )
                 log.info(
                     'Dispatched job %i. 
Running DM: "%s" with parameters: %s' @@ -343,6 +346,7 @@ def _parser(): action="store_true", help="Do not stop running when jobs have failed.", ) + parser.add_argument("--data_manager_mode", choices=["bundle", "populate", "dry_run"], default="populate") return parser @@ -358,7 +362,7 @@ def main(): gi = get_galaxy_connection(args, file=args.config, log=log, login_required=True) config = load_yaml_file(args.config) data_managers = DataManagers(gi, config) - data_managers.run(log, args.ignore_errors, args.overwrite) + data_managers.run(log, args.ignore_errors, args.overwrite, data_manager_mode=args.data_manager_mode) if __name__ == "__main__": From 77978ebebcde3c85f18ef77276de92fb42fa08e6 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Tue, 18 Apr 2023 12:56:37 +0200 Subject: [PATCH 02/31] Not sure if that was necessary --- src/ephemeris/shed_tools.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ephemeris/shed_tools.py b/src/ephemeris/shed_tools.py index 35e2553..be7d99f 100644 --- a/src/ephemeris/shed_tools.py +++ b/src/ephemeris/shed_tools.py @@ -35,6 +35,7 @@ """ import datetime as dt import json +import logging import os import re import time @@ -94,6 +95,7 @@ "Loading proprietary datatypes", } +log = logging.getLogger(__name__) class InstallRepoDict(TypedDict): name: str @@ -166,7 +168,7 @@ def filter_installed_repos( def install_repositories( self, repositories: List[InstallRepoDict], - log=None, + log=log, force_latest_revision: bool = False, default_toolshed: str = "https://toolshed.g2.bx.psu.edu/", default_install_tool_dependencies: bool = False, @@ -284,7 +286,7 @@ def install_repositories( errored_repositories=errored_repositories, ) - def update_repositories(self, repositories=None, log=None, **kwargs): + def update_repositories(self, repositories=None, log=log, **kwargs): if not repositories: # Repositories None or empty list repositories = self.installed_repositories() else: @@ -307,7 +309,7 @@ def test_tools( self, test_json, repositories=None, - log=None, + log=log, test_user_api_key=None, test_user="ephemeris@galaxyproject.org", test_history_name=None, @@ -588,7 +590,7 @@ def install_repository_revision(self, repository: InstallRepoDict, log): ) return "error" - def wait_for_install(self, repository, log=None, timeout=3600): + def wait_for_install(self, repository, log=log, timeout=3600): """ If nginx times out, we look into the list of installed repositories and try to determine if a repository of the same namer/owner is still installing. 
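
A usage sketch for the mode added in PATCH 01 (the --config, -g, and
--data_manager_mode flags are taken from run-data-managers itself; the
-a/--api_key flag follows ephemeris's common parser, and the URL and key
values are placeholders):

    run-data-managers --config data_managers.yml \
        -g https://galaxy.example.org -a "$API_KEY" \
        --data_manager_mode bundle

"populate" (the default) keeps the old behaviour of writing directly to
the target server's tool data tables, while "bundle" builds reference
data bundles against an existing server so they can later be pulled into
the instance that is allowed to write to CVMFS.
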
From 1b3df9bf5879a860ce413e4611902f69beee38a1 Mon Sep 17 00:00:00 2001
From: mvdbeek
Date: Thu, 29 Jun 2023 12:25:18 +0200
Subject: [PATCH 03/31] Allow setting history name in run_data_managers.py

---
 src/ephemeris/__init__.py          |  8 ++++++++
 src/ephemeris/run_data_managers.py | 24 +++++++++++++++++++-----
 src/ephemeris/shed_tools.py        |  1 +
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/src/ephemeris/__init__.py b/src/ephemeris/__init__.py
index 5191018..d98838a 100644
--- a/src/ephemeris/__init__.py
+++ b/src/ephemeris/__init__.py
@@ -16,6 +16,14 @@
 )
 
 
+def get_or_create_history(history_name: str, gi: galaxy.GalaxyInstance):
+    histories = gi.histories.get_histories(name=history_name)
+    if histories:
+        return histories[0]
+    else:
+        return gi.histories.create_history(name=history_name)
+
+
 def check_url(url, log=None):
     if not url.startswith("http"):
         if log:
diff --git a/src/ephemeris/run_data_managers.py b/src/ephemeris/run_data_managers.py
index 5d130f6..02e6221 100644
--- a/src/ephemeris/run_data_managers.py
+++ b/src/ephemeris/run_data_managers.py
@@ -28,15 +28,17 @@
 import logging
 import time
 from collections import namedtuple
-from typing import Literal
+from typing import Optional
 
 from bioblend.galaxy import GalaxyInstance
 from bioblend.galaxy.tool_data import ToolDataClient
 from bioblend.galaxy.tools import ToolClient
 from jinja2 import Template
+from typing_extensions import Literal
 
 from . import (
     get_galaxy_connection,
+    get_or_create_history,
     load_yaml_file,
 )
 from .common_parser import (
@@ -248,7 +250,14 @@ def parse_items(self, items):
             items = json.loads(rendered_items)
         return items
 
-    def run(self, log=None, ignore_errors=False, overwrite=False, data_manager_mode: DATA_MANAGER_MODES = "populate"):
+    def run(
+        self,
+        log=None,
+        ignore_errors=False,
+        overwrite=False,
+        data_manager_mode: DATA_MANAGER_MODES = "populate",
+        history_name: Optional[str] = None,
+    ):
         """
         Runs the data managers.
         :param log: The log to be used.
@@ -263,6 +272,10 @@
         if not log:
             log = logging.getLogger()
 
+        history_id: Optional[str] = None
+        if history_name is not None:
+            history_id = get_or_create_history()["id"]
+
         def run_jobs(jobs, skipped_jobs):
             job_list = []
             for skipped_job in skipped_jobs:
@@ -280,7 +293,7 @@ def run_jobs(jobs, skipped_jobs):
                 all_skipped_jobs.append(skipped_job)
             for job in jobs:
                 started_job = self.tool_client.run_tool(
-                    history_id=None, tool_id=job["tool_id"], tool_inputs=job["inputs"], data_manager_mode=data_manager_mode
+                    history_id=history_id, tool_id=job["tool_id"], tool_inputs=job["inputs"], data_manager_mode=data_manager_mode
                 )
                 log.info(
                     'Dispatched job %i. 
Running DM: "%s" with parameters: %s' @@ -346,7 +359,8 @@ def _parser(): action="store_true", help="Do not stop running when jobs have failed.", ) - parser.add_argument("--data_manager_mode", choices=["bundle", "populate", "dry_run"], default="populate") + parser.add_argument("--data-manager-mode", "--data_manager_mode", choices=["bundle", "populate", "dry_run"], default="populate") + parser.add_argument("--history-name", default=None) return parser @@ -362,7 +376,7 @@ def main(): gi = get_galaxy_connection(args, file=args.config, log=log, login_required=True) config = load_yaml_file(args.config) data_managers = DataManagers(gi, config) - data_managers.run(log, args.ignore_errors, args.overwrite, data_manager_mode=args.data_manager_mode) + data_managers.run(log, args.ignore_errors, args.overwrite, data_manager_mode=args.data_manager_mode, history_name=args.history_name) if __name__ == "__main__": diff --git a/src/ephemeris/shed_tools.py b/src/ephemeris/shed_tools.py index be7d99f..0f17bd0 100644 --- a/src/ephemeris/shed_tools.py +++ b/src/ephemeris/shed_tools.py @@ -97,6 +97,7 @@ log = logging.getLogger(__name__) + class InstallRepoDict(TypedDict): name: str owner: str From 943b73721ac45459a101fc596af4492044f23c95 Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Thu, 29 Jun 2023 12:09:47 -0400 Subject: [PATCH 04/31] Pass args to get_or_create_history() --- src/ephemeris/run_data_managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ephemeris/run_data_managers.py b/src/ephemeris/run_data_managers.py index 02e6221..2b70eab 100644 --- a/src/ephemeris/run_data_managers.py +++ b/src/ephemeris/run_data_managers.py @@ -274,7 +274,7 @@ def run( history_id: Optional[str] = None if history_name is not None: - history_id = get_or_create_history()["id"] + history_id = get_or_create_history(history_name, self.gi)["id"] def run_jobs(jobs, skipped_jobs): job_list = [] From bb4494ecf72b68b19fbdf43eb6eb84b85f3698f6 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Thu, 29 Jun 2023 16:37:06 -0400 Subject: [PATCH 05/31] split genomes for IDC --- setup.py | 3 + src/ephemeris/_config_models.py | 60 +++++ src/ephemeris/_idc_data_managers_to_tools.py | 97 ++++++++ src/ephemeris/_idc_lint.py | 31 +++ .../_idc_split_data_manager_genomes.py | 223 ++++++++++++++++++ src/ephemeris/common_parser.py | 30 ++- tests/__init__.py | 0 tests/conftest.py | 3 +- tests/test_idc_lint.py | 9 + tests/test_split_genomes.py | 83 +++++++ 10 files changed, 526 insertions(+), 13 deletions(-) create mode 100644 src/ephemeris/_config_models.py create mode 100644 src/ephemeris/_idc_data_managers_to_tools.py create mode 100644 src/ephemeris/_idc_lint.py create mode 100644 src/ephemeris/_idc_split_data_manager_genomes.py create mode 100644 tests/__init__.py create mode 100644 tests/test_idc_lint.py create mode 100644 tests/test_split_genomes.py diff --git a/setup.py b/setup.py index 1950610..d29e0e0 100644 --- a/setup.py +++ b/setup.py @@ -39,6 +39,9 @@ def get_var(var_name): install_tool_deps=ephemeris.install_tool_deps:main install-tool-deps=ephemeris.install_tool_deps:main set-library-permissions=ephemeris.set_library_permissions:main + _idc-lint=ephemeris._idc_lint:main + _idc-split-data-manager-genomes=ephemeris._idc_split_data_manager_genomes:main + _idc-data-managers-to-tools=ephemeris._idc_data_managers_to_tools:main ''' PACKAGE_DATA = { # Be sure to update MANIFEST.in for source dist. 
diff --git a/src/ephemeris/_config_models.py b/src/ephemeris/_config_models.py new file mode 100644 index 0000000..3b84925 --- /dev/null +++ b/src/ephemeris/_config_models.py @@ -0,0 +1,60 @@ +from pathlib import Path +from typing import ( + Dict, + List, + Optional, + Union, +) + +import yaml +from pydantic import ( + BaseModel, + Extra, +) + + +StrOrPath = Union[Path, str] + + +class DataManager(BaseModel, extra=Extra.forbid): + tags: List[str] + tool_id: str + + +class DataManagers(BaseModel, extra=Extra.forbid): + __root__: Dict[str, DataManager] + + +class Genome(BaseModel): + id: str # The unique id of the data in Galaxy + description: str # The description of the data, including its taxonomy, version and date + dbkey: Optional[str] + source: Optional[str] # The source of the data. Can be: 'ucsc', an NCBI accession number or a URL to a fasta file. + + # The following fields are currently purely for human consumption and unused by + # IDC infrastructure. + doi: Optional[str] # Any DOI associated with the data + blob: Optional[str] # A blob for any other pertinent information + checksum: Optional[str] # A SHA256 checksum of the original + version: Optional[str] # Any version information associated with the data + + # Description of actions (data managers) to run on target genome. + indexers: Optional[List[str]] # indexers to run - keyed on repository name - see data_managers.yml for how to resolve these to tools + skiplist: Optional[List[str]] # unimplemented: but if we implement classes of indexers, these will be ones to skip + + +class Genomes(BaseModel): + genomes: List[Genome] + + +def _read_yaml(path: StrOrPath): + with open(path, "r") as f: + return yaml.safe_load(f) + + +def read_data_managers(path: StrOrPath) -> DataManagers: + return DataManagers(__root__=_read_yaml(path)) + + +def read_genomes(path: StrOrPath) -> Genomes: + return Genomes(**_read_yaml(path)) diff --git a/src/ephemeris/_idc_data_managers_to_tools.py b/src/ephemeris/_idc_data_managers_to_tools.py new file mode 100644 index 0000000..fdb39f4 --- /dev/null +++ b/src/ephemeris/_idc_data_managers_to_tools.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +"""Helper script for IDC - not yet meant for public consumption. + +This script takes a data_managers.yml configuration describing the +set of data managers the IDC configuration targets and builds a +a tools.yml file from it for use with shed_tools. 
+""" +import argparse +import logging +from typing import ( + Dict, + List, + NamedTuple, +) + +import yaml + +from ._config_models import read_data_managers +from .common_parser import ( + add_log_file_argument, + add_verbosity_argument, +) +from .ephemeris_log import ( + disable_external_library_logging, + setup_global_logger, +) + + +class DataManager(NamedTuple): + tool_id: str + repository_name: str + tags: List[str] + + +def read_data_managers_configuration(path: str) -> Dict[str, DataManager]: + raw_data_managers = read_data_managers(path) + data_managers: Dict[str, DataManager] = {} + for repository_name, data_manager_configuration in raw_data_managers.__root__.items(): + data_manager = DataManager( + tool_id=data_manager_configuration.tool_id, + repository_name=repository_name, + tags=data_manager_configuration.tags or [], + ) + data_managers[repository_name] = data_manager + return data_managers + + +def build_shed_install_conf(path: str) -> dict: + data_managers = read_data_managers_configuration(path) + tools = [] + for data_manager in data_managers.values(): + tool_id = data_manager.tool_id + tool_id_parts = tool_id.split("/") + repo_owner = tool_id_parts[2] + repo_name = tool_id_parts[3] + entry = { + "name": repo_name, + "owner": repo_owner, + "tool_panel_section_label": None, + "tool_shed_url": "toolshed.g2.bx.psu.edu", + } + tools.append(entry) + tools_yaml = {"tools": tools} + return tools_yaml + + +def write_shed_install_conf(data_manager_conf_path: str, output_path: str) -> None: + tools_yaml = build_shed_install_conf(data_manager_conf_path) + with open(output_path, "w") as f: + yaml.safe_dump(tools_yaml, f) + + +def _parser(): + """returns the parser object.""" + + parser = argparse.ArgumentParser(add_help=False) + general_group = parser.add_argument_group("General options") + add_verbosity_argument(general_group) + add_log_file_argument(general_group) + parser.add_argument('--data-managers-conf', default="data_managers.yml") + parser.add_argument('--shed-install-output-conf', default="tools.yml") + + +def main(): + disable_external_library_logging() + parser = _parser() + args = parser.parse_args() + log = setup_global_logger(name=__name__, log_file=args.log_file) + if args.verbose: + log.setLevel(logging.DEBUG) + else: + log.setLevel(logging.INFO) + write_shed_install_conf(args.data_managers_conf, args.shed_install_output_conf) + + +if __name__ == "__main__": + main() diff --git a/src/ephemeris/_idc_lint.py b/src/ephemeris/_idc_lint.py new file mode 100644 index 0000000..859f7a8 --- /dev/null +++ b/src/ephemeris/_idc_lint.py @@ -0,0 +1,31 @@ +import os +from pathlib import Path + +import yaml + +from ._config_models import ( + read_data_managers, + read_genomes, +) + + +def read_yaml(path: Path): + with open(path, "r") as f: + return yaml.safe_load(f) + + +def lint_idc_directory(directory: Path): + genomes_path = directory / "genomes.yml" + data_managers_path = directory / "data_managers.yml" + assert genomes_path.exists() + assert data_managers_path.exists() + read_data_managers(data_managers_path) + read_genomes(genomes_path) + + +def main(): + lint_idc_directory(Path(os.curdir)) + + +if __name__ == "__main__": + main() diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py new file mode 100644 index 0000000..7740ef2 --- /dev/null +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python +"""Helper script for IDC - not yet meant for public consumption. 
+ +This script splits genomes.yml into tasks that are meant to be sent to +run_data_managers.py - while excluding data managers executions specified +by genomes.yml that have already been executed and appear in the target +installed data table configuration. +""" +import logging +import os +import re +from copy import deepcopy +from typing import ( + Any, + Callable, + Dict, + List, + Optional, +) + +import yaml +from galaxy.util import safe_makedirs +from pydantic import ( + BaseModel, + Extra, +) + +from . import get_galaxy_connection +from .common_parser import ( + get_common_args, +) +from ._idc_data_managers_to_tools import ( + DataManager, + read_data_managers_configuration, +) +from .ephemeris_log import ( + disable_external_library_logging, + setup_global_logger, +) + +IsBuildComplete = Callable[[str, str], bool] +TASK_FILE_NAME = "run_data_managers.yaml" + + +class SplitOptions: + merged_genomes_path: str + split_genomes_path: str + data_managers_path: str + is_build_complete: IsBuildComplete + + +def tool_id_for(indexer: str, data_managers: Dict[str, DataManager]) -> str: + data_manager = data_managers[indexer] + assert data_manager, f"Could not find a target data manager for indexer name {indexer}" + return data_manager.tool_id + + +class RunDataManager(BaseModel): + id: str + items: Optional[List[Any]] = None + params: Optional[List[Any]] = None + data_table_reload: Optional[List[str]] = None + + +class RunDataManagers(BaseModel): + data_managers: List[RunDataManager] + + +class DataManager(BaseModel, extra=Extra.forbid): + tags: List[str] + tool_id: str + + +class DataManagers(BaseModel, extra=Extra.forbid): + __root__: Dict[str, DataManager] + + +class Genome(BaseModel): + pass + + +class Genomes(BaseModel): + genomes: List[Genome] + + +def write_run_data_manager_to_file(run_data_manager: RunDataManager, path: str): + parent, _ = os.path.split(path) + if not os.path.exists(parent): + safe_makedirs(parent) + run_data_managers = RunDataManagers(data_managers=[run_data_manager]) + with open(path, "w") as of: + yaml.safe_dump(run_data_managers.dict(), of) + + +def split_genomes(split_options: SplitOptions) -> None: + + def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: str): + split_genomes_path = split_options.split_genomes_path + if not os.path.exists(split_options.split_genomes_path): + safe_makedirs(split_genomes_path) + + task_file_dir = os.path.join(split_genomes_path, build_id, indexer) + task_file = os.path.join(task_file_dir, TASK_FILE_NAME) + write_run_data_manager_to_file(run_data_manager, task_file) + + data_managers = read_data_managers_configuration(split_options.data_managers_path) + with open(split_options.merged_genomes_path) as f: + genomes_all = yaml.safe_load(f) + genomes = genomes_all["genomes"] + for genome in genomes: + build_id = genome["id"] + + fetch_indexer = "data_manager_fetch_genome_dbkeys_all_fasta" + if not split_options.is_build_complete(build_id, fetch_indexer): + fetch_tool_id = tool_id_for(fetch_indexer, data_managers) + fetch_params = [] + fetch_params.append({"dbkey_source|dbkey": genome["id"]}) + source = genome.get("source") + if source is None: + continue + elif source == "ucsc": + fetch_params.append({"reference_source|reference_source_selector": "ucsc"}) + fetch_params.append({"reference_source|requested_dbkey": genome["id"]}) + elif re.match("^[A-Z_]+[0-9.]+", source): + fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) + fetch_params.append({"reference_source|reference_source_selector": 
"ncbi"}) + fetch_params.append( + {"reference_source|requested_identifier": source} + ) + fetch_params.append({"sequence_name": genome["description"]}) + fetch_params.append({"sequence.id": genome["id"]}) + elif re.match("^http", source): + fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) + fetch_params.append({"reference_source|reference_source_selector": "url"}) + fetch_params.append({"reference_source|user_url": source}) + fetch_params.append({"sequence_name": genome["description"]}) + fetch_params.append({"sequence.id": genome["id"]}) + + fetch_run_data_manager = RunDataManager( + id=fetch_tool_id, + params=fetch_params, + # Not needed according to Marius + # data_table_reload=["all_fasta", "__dbkeys__"], + ) + write_task_file(fetch_run_data_manager, build_id, fetch_indexer) + + indexers = genome.get("indexers", []) + for indexer in indexers: + if split_options.is_build_complete(build_id, indexer): + continue + + data_manager = {} + tool_id = tool_id_for(indexer, data_managers) + params = [ + {"all_fasta_source": "{{ item.id }}"}, + {"sequence_name": "{{ item.name }}"}, + {"sequence_id": "{{ item.id }}"}, + ] + if re.search("bwa", tool_id): + data_manager["params"].append({"index_algorithm": "bwtsw"}) + if re.search("color_space", tool_id): + continue + + item = deepcopy(genome) + item.pop("indexers", None) + item.pop("blacklist", None) + + run_data_manager = RunDataManager( + id=tool_id, + params=params, + items=[item], + ) + write_task_file(run_data_manager, build_id, indexer) + + +class GalaxyHistoryIsBuildComplete: + + def __init__(self, history_names: List[str]): + self._history_names = history_names + + def __call__(self, build_id: str, indexer_name: str) -> bool: + target_history_name = f"idc-{build_id}-{indexer_name}" + return target_history_name in self._history_names + + +def _parser(): + """returns the parser object.""" + # login required to check history... 
+ parser = get_common_args(login_required=True) + + parser.add_argument('--merged-genomes-path', '-m', default="genomes.yml") + parser.add_argument('--split-genomes-path', '-s', default="data_manager_tasks") + parser.add_argument('--data-managers-path', default="data_managers.yml") + + +def get_galaxy_history_names(args) -> List[str]: + gi = get_galaxy_connection(args, login_required=True) + return [h["name"] for h in gi.histories.get_histories()] + + +def main(): + disable_external_library_logging() + parser = _parser() + args = parser.parse_args() + log = setup_global_logger(name=__name__, log_file=args.log_file) + if args.verbose: + log.setLevel(logging.DEBUG) + else: + log.setLevel(logging.INFO) + + is_build_complete = GalaxyHistoryIsBuildComplete(get_galaxy_history_names(args)) + + split_options = SplitOptions() + split_options.data_managers_path = args.data_managers_path + split_options.merged_genomes_path = args.merged_genomes_path + split_options.split_genomes_path = args.split_genomes_path + split_options.is_build_complete = is_build_complete + + split_genomes(split_options) + + +if __name__ == "__main__": + main() diff --git a/src/ephemeris/common_parser.py b/src/ephemeris/common_parser.py index 6992000..569aede 100644 --- a/src/ephemeris/common_parser.py +++ b/src/ephemeris/common_parser.py @@ -18,21 +18,29 @@ class ArgumentDefaultsHideUnderscoresHelpFormatter(HideUnderscoresHelpFormatter, pass +def add_verbosity_argument(parser_or_group): + parser_or_group.add_argument( + "-v", "--verbose", help="Increase output verbosity.", action="store_true" + ) + + +def add_log_file_argument(parser_or_group): + parser_or_group.add_argument( + "--log-file", + "--log_file", + dest="log_file", + help="Where the log file should be stored. " + "Default is a file in your system's temp folder", + default=None, + ) + + def get_common_args(login_required=True, log_file=False): parser = argparse.ArgumentParser(add_help=False) general_group = parser.add_argument_group("General options") - general_group.add_argument( - "-v", "--verbose", help="Increase output verbosity.", action="store_true" - ) + add_verbosity_argument(general_group) if log_file: - general_group.add_argument( - "--log-file", - "--log_file", - dest="log_file", - help="Where the log file should be stored. 
" - "Default is a file in your system's temp folder", - default=None, - ) + add_log_file_argument(general_group) con_group = parser.add_argument_group("Galaxy connection") con_group.add_argument( diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py index 56a89cc..ee4c3e0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,8 +13,6 @@ GALAXY_ADMIN_PASSWORD = "password" GALAXY_ADMIN_USER = "admin@galaxy.org" -client = docker.from_env() - GalaxyContainer = namedtuple( "GalaxyContainer", ["url", "container", "attributes", "gi"] ) @@ -32,6 +30,7 @@ def start_container(**kwargs): key = kwargs.get("api_key", GALAXY_ADMIN_KEY) ensure_admin = kwargs.get("ensure_admin", True) + client = docker.from_env() container = client.containers.run( GALAXY_IMAGE, detach=True, ports={"80/tcp": None}, **kwargs ) diff --git a/tests/test_idc_lint.py b/tests/test_idc_lint.py new file mode 100644 index 0000000..50de40c --- /dev/null +++ b/tests/test_idc_lint.py @@ -0,0 +1,9 @@ +from pathlib import Path + +from ephemeris._idc_lint import lint_idc_directory +from .test_split_genomes import setup_mock_idc_dir + + +def test_idc_lint(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + lint_idc_directory(tmp_path) diff --git a/tests/test_split_genomes.py b/tests/test_split_genomes.py new file mode 100644 index 0000000..e623321 --- /dev/null +++ b/tests/test_split_genomes.py @@ -0,0 +1,83 @@ +from pathlib import Path + +import yaml + +from ephemeris._idc_split_data_manager_genomes import ( + GalaxyHistoryIsBuildComplete, + RunDataManagers, + split_genomes, + SplitOptions, +) + +MERGED_YAML_STR = """ +genomes: + - dbkey: hg19_rCRS_pUC18_phiX174 + description: Homo sapiens (hg19 with mtDNA replaced with rCRS, and containing pUC18 + and phiX174) + source: http://datacache.galaxyproject.org/managed/seq/hg19_rCRS_pUC18_phiX174.fa + id: hg19_rCRS_pUC18_phiX174 + indexers: + - data_manager_twobit_builder + - data_manager_star_index_builder + + - dbkey: rn6 + description: Rat Jul. 
2014 (RGSC 6.0/rn6) (rn6) + id: rn6 + source: ucsc + indexers: + - data_manager_twobit_builder + - data_manager_picard_index_builder +""" + +DATA_MANAGER_YAML_STR = """ +data_manager_fetch_genome_dbkeys_all_fasta: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_fetch_genome_dbkeys_all_fasta/data_manager_fetch_genome_all_fasta_dbkey/0.0.3' + tags: + - fetch_source +data_manager_twobit_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2' + tags: + - genome +data_manager_picard_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_picard_index_builder/data_manager/picard_index_builder/0.0.1' + tags: + - genome +data_manager_star_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/iuc/data_manager_star_index_builder/rna_star_index_builder_data_manager/0.0.5' + tags: + - genome +""" + + +def setup_mock_idc_dir(directory: Path): + merged = directory / "genomes.yml" + merged.write_text(MERGED_YAML_STR) + + data_managers = directory / "data_managers.yml" + data_managers.write_text(DATA_MANAGER_YAML_STR) + + +def read_and_validate_run_data_manager_yaml(path): + with open(path, "r") as f: + return RunDataManagers(**yaml.safe_load(f)) + + +def test_split_genomes(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + + split_path = tmp_path / "split" + + history_names = ["idc-hg19_rCRS_pUC18_phiX174-data_manager_star_index_builder"] + is_build_complete = GalaxyHistoryIsBuildComplete(history_names) + + split_options = SplitOptions() + split_options.merged_genomes_path = tmp_path / "genomes.yml" + split_options.split_genomes_path = str(split_path) + split_options.data_managers_path = tmp_path / "data_managers.yml" + split_options.is_build_complete = is_build_complete + split_genomes(split_options) + new_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + complete_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_star_index_builder" + assert new_task.exists() + assert not complete_task.exists() + read_and_validate_run_data_manager_yaml(new_task / "run_data_managers.yaml") From b808e909cd45f163c85e49facdc0964a2ded9689 Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Fri, 30 Jun 2023 18:47:22 -0400 Subject: [PATCH 06/31] Actually return the parser --- src/ephemeris/_idc_data_managers_to_tools.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ephemeris/_idc_data_managers_to_tools.py b/src/ephemeris/_idc_data_managers_to_tools.py index fdb39f4..ca8127b 100644 --- a/src/ephemeris/_idc_data_managers_to_tools.py +++ b/src/ephemeris/_idc_data_managers_to_tools.py @@ -79,6 +79,7 @@ def _parser(): add_log_file_argument(general_group) parser.add_argument('--data-managers-conf', default="data_managers.yml") parser.add_argument('--shed-install-output-conf', default="tools.yml") + return parser def main(): From 6c01e0cb0d2585596c6b4947cd989f34ebd38e55 Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Fri, 30 Jun 2023 20:39:01 -0400 Subject: [PATCH 07/31] Fix _idc_split_data_manager_genomes parser --- src/ephemeris/_idc_split_data_manager_genomes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index 7740ef2..a1f82b7 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -186,11 +186,11 @@ def __call__(self, build_id: str, indexer_name: str) -> bool: def _parser(): """returns the parser 
object.""" # login required to check history... - parser = get_common_args(login_required=True) - + parser = get_common_args(login_required=True, log_file=True) parser.add_argument('--merged-genomes-path', '-m', default="genomes.yml") parser.add_argument('--split-genomes-path', '-s', default="data_manager_tasks") parser.add_argument('--data-managers-path', default="data_managers.yml") + return parser def get_galaxy_history_names(args) -> List[str]: From 55157a83de4f957c02c74ae1ef9e1a6a5c02d2d4 Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Fri, 30 Jun 2023 21:12:51 -0400 Subject: [PATCH 08/31] Include description in fetch --- src/ephemeris/_idc_split_data_manager_genomes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index a1f82b7..5de770b 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -121,6 +121,7 @@ def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: st elif source == "ucsc": fetch_params.append({"reference_source|reference_source_selector": "ucsc"}) fetch_params.append({"reference_source|requested_dbkey": genome["id"]}) + fetch_params.append({"sequence_name": genome["description"]}) elif re.match("^[A-Z_]+[0-9.]+", source): fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) fetch_params.append({"reference_source|reference_source_selector": "ncbi"}) From b3de919abb96c6fb6c0274184de8e5f37b5054ee Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Fri, 30 Jun 2023 21:13:03 -0400 Subject: [PATCH 09/31] Add logging --- src/ephemeris/_idc_split_data_manager_genomes.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index 5de770b..ad0ac8f 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -41,6 +41,8 @@ IsBuildComplete = Callable[[str, str], bool] TASK_FILE_NAME = "run_data_managers.yaml" +log = logging.getLogger(__name__) + class SplitOptions: merged_genomes_path: str @@ -112,6 +114,7 @@ def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: st fetch_indexer = "data_manager_fetch_genome_dbkeys_all_fasta" if not split_options.is_build_complete(build_id, fetch_indexer): + log.info(f"Fetching: {build_id}") fetch_tool_id = tool_id_for(fetch_indexer, data_managers) fetch_params = [] fetch_params.append({"dbkey_source|dbkey": genome["id"]}) @@ -144,21 +147,26 @@ def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: st # data_table_reload=["all_fasta", "__dbkeys__"], ) write_task_file(fetch_run_data_manager, build_id, fetch_indexer) + else: + log.debug(f"Fetch is already completed: {build_id}") indexers = genome.get("indexers", []) for indexer in indexers: if split_options.is_build_complete(build_id, indexer): + log.debug(f"Build is already completed: {build_id} {indexer}") continue - data_manager = {} + log.info(f"Building: {build_id} {indexer}") + tool_id = tool_id_for(indexer, data_managers) params = [ {"all_fasta_source": "{{ item.id }}"}, {"sequence_name": "{{ item.name }}"}, {"sequence_id": "{{ item.id }}"}, ] + # why is this not pulled from the data managers conf? 
-nate if re.search("bwa", tool_id): - data_manager["params"].append({"index_algorithm": "bwtsw"}) + params.append({"index_algorithm": "bwtsw"}) if re.search("color_space", tool_id): continue From 3e963180f34c3c7c23a33b197690ace0042f63ee Mon Sep 17 00:00:00 2001 From: Nate Coraor Date: Fri, 30 Jun 2023 22:13:08 -0400 Subject: [PATCH 10/31] Set defaults compatible with run-data-managers --- src/ephemeris/_idc_split_data_manager_genomes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index ad0ac8f..e371497 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -59,9 +59,9 @@ def tool_id_for(indexer: str, data_managers: Dict[str, DataManager]) -> str: class RunDataManager(BaseModel): id: str - items: Optional[List[Any]] = None + items: Optional[List[Any]] = [] params: Optional[List[Any]] = None - data_table_reload: Optional[List[str]] = None + data_table_reload: Optional[List[str]] = [] class RunDataManagers(BaseModel): @@ -172,7 +172,7 @@ def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: st item = deepcopy(genome) item.pop("indexers", None) - item.pop("blacklist", None) + item.pop("skiplist", None) run_data_manager = RunDataManager( id=tool_id, From 6c0ed8b1e35972cb17761e4c0db5771eb6117d39 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 12:19:08 -0400 Subject: [PATCH 11/31] Handle empty items and data reload fields in run_data_managers. --- src/ephemeris/run_data_managers.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/ephemeris/run_data_managers.py b/src/ephemeris/run_data_managers.py index 2b70eab..9dcdc7b 100644 --- a/src/ephemeris/run_data_managers.py +++ b/src/ephemeris/run_data_managers.py @@ -159,8 +159,8 @@ def get_dm_jobs(self, dm): :returns job_list, skipped_job_list""" job_list = [] skipped_job_list = [] - items = self.parse_items(dm.get("items", [""])) - for item in items: + + def handle_item(item: str): dm_id = dm["id"] params = dm["params"] inputs = dict() @@ -174,11 +174,20 @@ def get_dm_jobs(self, dm): job = dict(tool_id=dm_id, inputs=inputs) - data_tables = dm.get("data_table_reload", []) + data_tables = dm.get("data_table_reload") or [] if self.input_entries_exist_in_data_tables(data_tables, inputs): skipped_job_list.append(job) else: job_list.append(job) + + raw_items = dm.get("items") or None + if raw_items: + items = self.parse_items(raw_items) + for item in items: + handle_item(item) + else: + handle_item("") + return job_list, skipped_job_list def dm_is_fetcher(self, dm): From 33f6b281bb414ea84fd59a215aed31b23c06a460 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 12:32:07 -0400 Subject: [PATCH 12/31] Don't serialize unset fields when splitting genomes.yml into run tasks. 
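
The change relies on pydantic's exclude_unset serialization flag. A
minimal illustration of the behaviour (pydantic v1 API, mirroring the
RunDataManager model from this series; not part of the patch itself):

    from typing import Any, List, Optional
    from pydantic import BaseModel

    class RunDataManager(BaseModel):
        id: str
        items: Optional[List[Any]] = None
        params: Optional[List[Any]] = None
        data_table_reload: Optional[List[str]] = None

    rdm = RunDataManager(id="some_tool", params=[{"a": "b"}])
    # dict() keeps unset optional fields as explicit None entries
    assert rdm.dict()["data_table_reload"] is None
    # dict(exclude_unset=True) drops fields that were never assigned
    assert rdm.dict(exclude_unset=True) == {"id": "some_tool", "params": [{"a": "b"}]}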
--- src/ephemeris/_idc_split_data_manager_genomes.py | 6 +++--- tests/test_split_genomes.py | 10 +++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index e371497..038865e 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -59,9 +59,9 @@ def tool_id_for(indexer: str, data_managers: Dict[str, DataManager]) -> str: class RunDataManager(BaseModel): id: str - items: Optional[List[Any]] = [] + items: Optional[List[Any]] = None params: Optional[List[Any]] = None - data_table_reload: Optional[List[str]] = [] + data_table_reload: Optional[List[str]] = None class RunDataManagers(BaseModel): @@ -91,7 +91,7 @@ def write_run_data_manager_to_file(run_data_manager: RunDataManager, path: str): safe_makedirs(parent) run_data_managers = RunDataManagers(data_managers=[run_data_manager]) with open(path, "w") as of: - yaml.safe_dump(run_data_managers.dict(), of) + yaml.safe_dump(run_data_managers.dict(exclude_unset=True), of) def split_genomes(split_options: SplitOptions) -> None: diff --git a/tests/test_split_genomes.py b/tests/test_split_genomes.py index e623321..5dc7c5a 100644 --- a/tests/test_split_genomes.py +++ b/tests/test_split_genomes.py @@ -80,4 +80,12 @@ def test_split_genomes(tmp_path: Path): complete_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_star_index_builder" assert new_task.exists() assert not complete_task.exists() - read_and_validate_run_data_manager_yaml(new_task / "run_data_managers.yaml") + new_task_run_yaml = new_task / "run_data_managers.yaml" + # ensure we don't serialize unset fields + assert "data_table_reload" not in new_task_run_yaml.read_text() + run = read_and_validate_run_data_manager_yaml(new_task_run_yaml) + assert len(run.data_managers) == 1 + data_manager = run.data_managers[0] + assert data_manager.id == "toolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2" + assert data_manager.items[0]["id"] == "hg19_rCRS_pUC18_phiX174" + assert data_manager.items[0]["dbkey"] == "hg19_rCRS_pUC18_phiX174" From 5e0646791e5a77e5f76c81a5ea20b177e6b2fea5 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 12:38:29 -0400 Subject: [PATCH 13/31] Lint fixes & fix for adding __init__ in tests. --- src/ephemeris/_config_models.py | 4 ++-- src/ephemeris/_idc_split_data_manager_genomes.py | 6 +++--- tests/test_run_data_managers.py | 10 +++++----- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/ephemeris/_config_models.py b/src/ephemeris/_config_models.py index 3b84925..7eff971 100644 --- a/src/ephemeris/_config_models.py +++ b/src/ephemeris/_config_models.py @@ -34,8 +34,8 @@ class Genome(BaseModel): # The following fields are currently purely for human consumption and unused by # IDC infrastructure. doi: Optional[str] # Any DOI associated with the data - blob: Optional[str] # A blob for any other pertinent information - checksum: Optional[str] # A SHA256 checksum of the original + blob: Optional[str] # A blob for any other pertinent information + checksum: Optional[str] # A SHA256 checksum of the original version: Optional[str] # Any version information associated with the data # Description of actions (data managers) to run on target genome. 
diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index 038865e..dfcc770 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -26,13 +26,13 @@ ) from . import get_galaxy_connection -from .common_parser import ( - get_common_args, -) from ._idc_data_managers_to_tools import ( DataManager, read_data_managers_configuration, ) +from .common_parser import ( + get_common_args, +) from .ephemeris_log import ( disable_external_library_logging, setup_global_logger, diff --git a/tests/test_run_data_managers.py b/tests/test_run_data_managers.py index ccee537..bf43b65 100644 --- a/tests/test_run_data_managers.py +++ b/tests/test_run_data_managers.py @@ -7,16 +7,16 @@ import pytest import yaml -from conftest import ( - GALAXY_ADMIN_KEY, - GALAXY_ADMIN_PASSWORD, - GALAXY_ADMIN_USER, -) from ephemeris import run_data_managers from ephemeris.run_data_managers import DataManagers from ephemeris.shed_tools import InstallRepositoryManager from ephemeris.sleep import galaxy_wait +from .conftest import ( + GALAXY_ADMIN_KEY, + GALAXY_ADMIN_PASSWORD, + GALAXY_ADMIN_USER, +) AUTH_BY = "key" From 542a883eee17f0fd84fa40fcd69407a5f2d6614b Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 12:55:28 -0400 Subject: [PATCH 14/31] Improved IDC linting... --- src/ephemeris/_idc_lint.py | 10 ++++++++-- tests/test_idc_lint.py | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/src/ephemeris/_idc_lint.py b/src/ephemeris/_idc_lint.py index 859f7a8..bf02745 100644 --- a/src/ephemeris/_idc_lint.py +++ b/src/ephemeris/_idc_lint.py @@ -19,8 +19,14 @@ def lint_idc_directory(directory: Path): data_managers_path = directory / "data_managers.yml" assert genomes_path.exists() assert data_managers_path.exists() - read_data_managers(data_managers_path) - read_genomes(genomes_path) + data_managers = read_data_managers(data_managers_path).__root__ + genomes = read_genomes(genomes_path) + print(genomes) + for genome in genomes.genomes: + print(genome) + for indexer in (genome.indexers or []): + if indexer not in data_managers: + raise Exception(f"Failed to find data manager {indexer} referenced for genome {genome}") def main(): diff --git a/tests/test_idc_lint.py b/tests/test_idc_lint.py index 50de40c..5ec997a 100644 --- a/tests/test_idc_lint.py +++ b/tests/test_idc_lint.py @@ -1,9 +1,40 @@ from pathlib import Path +import pytest + from ephemeris._idc_lint import lint_idc_directory from .test_split_genomes import setup_mock_idc_dir -def test_idc_lint(tmp_path: Path): +MISSPELLED_DATA_MANAGER_YAML_STR = """ +data_manager_fetch_genome_dbkeys_all_fasta: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_fetch_genome_dbkeys_all_fasta/data_manager_fetch_genome_all_fasta_dbkey/0.0.3' + tags: + - fetch_source +data_manager_two_bit_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2' + tags: + - genome +data_manager_picard_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_picard_index_builder/data_manager/picard_index_builder/0.0.1' + tags: + - genome +data_manager_star_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/iuc/data_manager_star_index_builder/rna_star_index_builder_data_manager/0.0.5' + tags: + - genome +""" + + +def test_idc_lint_valid(tmp_path: Path): setup_mock_idc_dir(tmp_path) lint_idc_directory(tmp_path) + + +def 
test_idc_lint_misspelled_dm(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + (tmp_path / "data_managers.yml").write_text(MISSPELLED_DATA_MANAGER_YAML_STR) + with pytest.raises(Exception) as exc_info: + lint_idc_directory(tmp_path) + # misspelled two_bit in data managers so data_manager_twobit_builder is missing + assert "data_manager_twobit_builder" in str(exc_info.value) From 3a822ddb0e2d8fff1eb80843b0f9157fa1ff4de1 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 14:42:15 -0400 Subject: [PATCH 15/31] More shed linting... --- src/ephemeris/_idc_lint.py | 7 ++++++- tests/test_idc_lint.py | 24 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/ephemeris/_idc_lint.py b/src/ephemeris/_idc_lint.py index bf02745..e8949ea 100644 --- a/src/ephemeris/_idc_lint.py +++ b/src/ephemeris/_idc_lint.py @@ -21,7 +21,12 @@ def lint_idc_directory(directory: Path): assert data_managers_path.exists() data_managers = read_data_managers(data_managers_path).__root__ genomes = read_genomes(genomes_path) - print(genomes) + + for data_manager in data_managers.values(): + data_manager_tool_id = data_manager.tool_id + if not data_manager_tool_id.startswith("toolshed.g2.bx.psu.edu/"): + raise Exception(f"Expected a data manager repository from main Galaxy tool shed but discovered tool ID {data_manager_tool_id}") + for genome in genomes.genomes: print(genome) for indexer in (genome.indexers or []): diff --git a/tests/test_idc_lint.py b/tests/test_idc_lint.py index 5ec997a..3db0e74 100644 --- a/tests/test_idc_lint.py +++ b/tests/test_idc_lint.py @@ -25,6 +25,25 @@ - genome """ +TESTTOOLSHED_DATA_MANAGER_YAML_STR = """ +data_manager_fetch_genome_dbkeys_all_fasta: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_fetch_genome_dbkeys_all_fasta/data_manager_fetch_genome_all_fasta_dbkey/0.0.3' + tags: + - fetch_source +data_manager_twobit_builder: + tool_id: 'testtoolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2' + tags: + - genome +data_manager_picard_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_picard_index_builder/data_manager/picard_index_builder/0.0.1' + tags: + - genome +data_manager_star_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/iuc/data_manager_star_index_builder/rna_star_index_builder_data_manager/0.0.5' + tags: + - genome +""" + def test_idc_lint_valid(tmp_path: Path): setup_mock_idc_dir(tmp_path) @@ -38,3 +57,8 @@ def test_idc_lint_misspelled_dm(tmp_path: Path): lint_idc_directory(tmp_path) # misspelled two_bit in data managers so data_manager_twobit_builder is missing assert "data_manager_twobit_builder" in str(exc_info.value) + + (tmp_path / "data_managers.yml").write_text(TESTTOOLSHED_DATA_MANAGER_YAML_STR) + with pytest.raises(Exception) as exc_info: + lint_idc_directory(tmp_path) + assert "testtoolshed" in str(exc_info.value) From b0d0a0babae521337d0cab40aa9398f8e6470dca Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 14:47:54 -0400 Subject: [PATCH 16/31] Test case for data manager tools YAML generation. 
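
For reference, the YAML being validated here is produced by
build_shed_install_conf, which splits each tool shed GUID into its owner
and repository name. For the twobit entry in the test fixture, the
expected output is roughly:

    tools:
    - name: data_manager_twobit_builder
      owner: devteam
      tool_panel_section_label: null
      tool_shed_url: toolshed.g2.bx.psu.edu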
--- src/ephemeris/_config_models.py | 23 ++++++++++++++++++++ src/ephemeris/_idc_data_managers_to_tools.py | 9 +++++++- tests/test_idc_data_managers_to_tools.py | 13 +++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 tests/test_idc_data_managers_to_tools.py diff --git a/src/ephemeris/_config_models.py b/src/ephemeris/_config_models.py index 7eff971..ce74804 100644 --- a/src/ephemeris/_config_models.py +++ b/src/ephemeris/_config_models.py @@ -16,6 +16,25 @@ StrOrPath = Union[Path, str] +class RepositoryInstallTarget(BaseModel): + name: str + owner: str + tool_shed_url: Optional[str] + tool_panel_section_id: Optional[str] + tool_panel_section_label: Optional[str] + revisions: Optional[List[str]] + install_tool_dependencies: Optional[bool] + install_repository_dependencies: Optional[bool] + install_resolver_dependencies: Optional[bool] + + +class RepositoryInstallTargets(BaseModel): + """ """ + api_key: Optional[str] + galaxy_instance: Optional[str] + tools: List[RepositoryInstallTarget] + + class DataManager(BaseModel, extra=Extra.forbid): tags: List[str] tool_id: str @@ -58,3 +77,7 @@ def read_data_managers(path: StrOrPath) -> DataManagers: def read_genomes(path: StrOrPath) -> Genomes: return Genomes(**_read_yaml(path)) + + +def read_tools(path: StrOrPath) -> RepositoryInstallTargets: + return RepositoryInstallTargets(**_read_yaml(path)) diff --git a/src/ephemeris/_idc_data_managers_to_tools.py b/src/ephemeris/_idc_data_managers_to_tools.py index ca8127b..20f7b68 100644 --- a/src/ephemeris/_idc_data_managers_to_tools.py +++ b/src/ephemeris/_idc_data_managers_to_tools.py @@ -15,7 +15,10 @@ import yaml -from ._config_models import read_data_managers +from ._config_models import ( + read_data_managers, + RepositoryInstallTargets, +) from .common_parser import ( add_log_file_argument, add_verbosity_argument, @@ -66,6 +69,10 @@ def build_shed_install_conf(path: str) -> dict: def write_shed_install_conf(data_manager_conf_path: str, output_path: str) -> None: tools_yaml = build_shed_install_conf(data_manager_conf_path) + + # validate generated dict to ensure we're writing out valid file + RepositoryInstallTargets(**tools_yaml) + with open(output_path, "w") as f: yaml.safe_dump(tools_yaml, f) diff --git a/tests/test_idc_data_managers_to_tools.py b/tests/test_idc_data_managers_to_tools.py new file mode 100644 index 0000000..f00a647 --- /dev/null +++ b/tests/test_idc_data_managers_to_tools.py @@ -0,0 +1,13 @@ +from pathlib import Path + +from ephemeris._config_models import read_tools +from ephemeris._idc_data_managers_to_tools import write_shed_install_conf +from .test_split_genomes import setup_mock_idc_dir + + +def test_idc_lint_valid(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + output_path = tmp_path / "output.yaml" + write_shed_install_conf(tmp_path / "data_managers.yml", output_path) + # validate the generated tools file... + read_tools(output_path) From 7285560dab65838c8ede73d13da5f005895b5300 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 15:30:17 -0400 Subject: [PATCH 17/31] Implement stages and filtering in IDC split script. 
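
With these filters a single build/indexer combination can be split out
in isolation. A sketch of the intended invocation (the entry point and
filter flags come from this series; the connection flags follow
ephemeris's common parser, the URL is a placeholder, and API credentials
are also needed since the history check requires login):

    _idc-split-data-manager-genomes \
        -g https://galaxy.example.org \
        --merged-genomes-path genomes.yml \
        --data-managers-path data_managers.yml \
        --split-genomes-path data_manager_tasks \
        --filter-build-id rn6 \
        --filter-data-manager data_manager_twobit_builder

Stage 0 corresponds to the fetch data manager and stage 1 to the
indexer runs.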
--- .../_idc_split_data_manager_genomes.py | 80 ++++++++++++++---- tests/test_split_genomes.py | 81 +++++++++++++++++-- 2 files changed, 138 insertions(+), 23 deletions(-) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index dfcc770..ee68441 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -44,11 +44,27 @@ log = logging.getLogger(__name__) +class Filters: + stage: Optional[int] = None + data_manager: Optional[str] = None + build_id: Optional[str] = None + + def filter_out_data_manager(self, data_manager: str) -> bool: + return bool(self.data_manager and data_manager != self.data_manager) + + def filter_out_build_id(self, build_id: str) -> bool: + return bool(self.build_id and build_id != self.build_id) + + def filter_out_stage(self, stage: int) -> bool: + return bool(self.stage is not None and self.stage != stage) + + class SplitOptions: merged_genomes_path: str split_genomes_path: str data_managers_path: str is_build_complete: IsBuildComplete + filters: Filters = Filters() def tool_id_for(indexer: str, data_managers: Dict[str, DataManager]) -> str: @@ -94,34 +110,31 @@ def write_run_data_manager_to_file(run_data_manager: RunDataManager, path: str): yaml.safe_dump(run_data_managers.dict(exclude_unset=True), of) -def split_genomes(split_options: SplitOptions) -> None: - - def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: str): - split_genomes_path = split_options.split_genomes_path - if not os.path.exists(split_options.split_genomes_path): - safe_makedirs(split_genomes_path) - - task_file_dir = os.path.join(split_genomes_path, build_id, indexer) - task_file = os.path.join(task_file_dir, TASK_FILE_NAME) - write_run_data_manager_to_file(run_data_manager, task_file) - +def walk_over_incomplete_runs(split_options: SplitOptions): data_managers = read_data_managers_configuration(split_options.data_managers_path) with open(split_options.merged_genomes_path) as f: genomes_all = yaml.safe_load(f) genomes = genomes_all["genomes"] for genome in genomes: build_id = genome["id"] + if split_options.filters.filter_out_build_id(build_id): + continue fetch_indexer = "data_manager_fetch_genome_dbkeys_all_fasta" - if not split_options.is_build_complete(build_id, fetch_indexer): + do_fetch = not split_options.filters.filter_out_data_manager(fetch_indexer) + source = genome.get("source") + if source is None: + do_fetch = False + if do_fetch and split_options.filters.filter_out_stage(0): + do_fetch = False + + if do_fetch and not split_options.is_build_complete(build_id, fetch_indexer): log.info(f"Fetching: {build_id}") fetch_tool_id = tool_id_for(fetch_indexer, data_managers) fetch_params = [] fetch_params.append({"dbkey_source|dbkey": genome["id"]}) source = genome.get("source") - if source is None: - continue - elif source == "ucsc": + if source == "ucsc": fetch_params.append({"reference_source|reference_source_selector": "ucsc"}) fetch_params.append({"reference_source|requested_dbkey": genome["id"]}) fetch_params.append({"sequence_name": genome["description"]}) @@ -146,12 +159,18 @@ def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: st # Not needed according to Marius # data_table_reload=["all_fasta", "__dbkeys__"], ) - write_task_file(fetch_run_data_manager, build_id, fetch_indexer) + yield (build_id, fetch_indexer, fetch_run_data_manager) else: log.debug(f"Fetch is already completed: {build_id}") indexers = 
genome.get("indexers", []) for indexer in indexers: + if split_options.filters.filter_out_data_manager(indexer): + continue + + if split_options.filters.filter_out_stage(1): + continue + if split_options.is_build_complete(build_id, indexer): log.debug(f"Build is already completed: {build_id} {indexer}") continue @@ -179,7 +198,22 @@ def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: st params=params, items=[item], ) - write_task_file(run_data_manager, build_id, indexer) + yield (build_id, indexer, run_data_manager) + + +def split_genomes(split_options: SplitOptions) -> None: + + def write_task_file(build_id: str, indexer: str, run_data_manager: RunDataManager): + split_genomes_path = split_options.split_genomes_path + if not os.path.exists(split_options.split_genomes_path): + safe_makedirs(split_genomes_path) + + task_file_dir = os.path.join(split_genomes_path, build_id, indexer) + task_file = os.path.join(task_file_dir, TASK_FILE_NAME) + write_run_data_manager_to_file(run_data_manager, task_file) + + for build_id, indexer, run_data_manager in walk_over_incomplete_runs(split_options): + write_task_file(build_id, indexer, run_data_manager) class GalaxyHistoryIsBuildComplete: @@ -199,6 +233,12 @@ def _parser(): parser.add_argument('--merged-genomes-path', '-m', default="genomes.yml") parser.add_argument('--split-genomes-path', '-s', default="data_manager_tasks") parser.add_argument('--data-managers-path', default="data_managers.yml") + + # filters + parser.add_argument('--filter-stage', default=None) + parser.add_argument('--filter-data-manager', default=None) + parser.add_argument('--filter-build-id', default=None) + return parser @@ -225,6 +265,12 @@ def main(): split_options.split_genomes_path = args.split_genomes_path split_options.is_build_complete = is_build_complete + filters = Filters() + filters.build_id = args.filter_build_id + filters.data_manager = args.filter_data_manager + filters.stage = args.filter_stage + split_options.filters = filters + split_genomes(split_options) diff --git a/tests/test_split_genomes.py b/tests/test_split_genomes.py index 5dc7c5a..fbddcef 100644 --- a/tests/test_split_genomes.py +++ b/tests/test_split_genomes.py @@ -3,6 +3,7 @@ import yaml from ephemeris._idc_split_data_manager_genomes import ( + Filters, GalaxyHistoryIsBuildComplete, RunDataManagers, split_genomes, @@ -62,19 +63,22 @@ def read_and_validate_run_data_manager_yaml(path): return RunDataManagers(**yaml.safe_load(f)) -def test_split_genomes(tmp_path: Path): - setup_mock_idc_dir(tmp_path) - - split_path = tmp_path / "split" - +def split_options_for(tmp_path: Path) -> SplitOptions: history_names = ["idc-hg19_rCRS_pUC18_phiX174-data_manager_star_index_builder"] is_build_complete = GalaxyHistoryIsBuildComplete(history_names) split_options = SplitOptions() split_options.merged_genomes_path = tmp_path / "genomes.yml" - split_options.split_genomes_path = str(split_path) + split_options.split_genomes_path = str(tmp_path / "split") split_options.data_managers_path = tmp_path / "data_managers.yml" split_options.is_build_complete = is_build_complete + return split_options + + +def test_split_genomes(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) split_genomes(split_options) new_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" complete_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_star_index_builder" @@ -89,3 +93,68 @@ def test_split_genomes(tmp_path: 
Path): assert data_manager.id == "toolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2" assert data_manager.items[0]["id"] == "hg19_rCRS_pUC18_phiX174" assert data_manager.items[0]["dbkey"] == "hg19_rCRS_pUC18_phiX174" + + +def test_split_genomes_filter_on_data_manager(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + filters = Filters() + filters.data_manager = "data_manager_star_index_builder" + split_options.filters = filters + + split_genomes(split_options) + new_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + assert not new_task.exists() + + filters.data_manager = "data_manager_twobit_builder" + split_genomes(split_options) + assert new_task.exists() + + +def test_split_genomes_filter_on_build_id(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + filters = Filters() + filters.build_id = "rn6" + split_options.filters = filters + + split_genomes(split_options) + filtered_out_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + assert not filtered_out_task.exists() + + filtered_in_task = split_path / "rn6" / "data_manager_twobit_builder" + assert filtered_in_task.exists() + + +def test_split_genomes_filter_on_stage_0(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + filters = Filters() + filters.stage = 0 + split_options.filters = filters + + split_genomes(split_options) + filtered_out_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + assert not filtered_out_task.exists() + + filtered_in_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_fetch_genome_dbkeys_all_fasta" + assert filtered_in_task.exists() + + +def test_split_genomes_filter_on_stage_1(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + filters = Filters() + filters.stage = 1 + split_options.filters = filters + + split_genomes(split_options) + filtered_out_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_fetch_genome_dbkeys_all_fasta" + assert not filtered_out_task.exists() + + filtered_in_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + assert filtered_in_task.exists() From 7dc7107c0cd384c2dcb5f366cc08d3167115c57c Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 16:22:15 -0400 Subject: [PATCH 18/31] New defaults... 
EPHEMERIS_GALAXY and EPHEMERIS_API_KEY --- src/ephemeris/__init__.py | 3 ++- src/ephemeris/common_parser.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ephemeris/__init__.py b/src/ephemeris/__init__.py index d98838a..11d33ac 100644 --- a/src/ephemeris/__init__.py +++ b/src/ephemeris/__init__.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import os import yaml from bioblend import galaxy @@ -47,7 +48,7 @@ def get_galaxy_connection(args, file=None, log=None, login_required=True): url = args.galaxy or file_content.get("galaxy_instance") galaxy_url = check_url(url, log) - api_key = args.api_key or file_content.get("api_key") + api_key = args.api_key or file_content.get("api_key") or os.environ.get("EPHEMERIS_API_KEY") if args.user and args.password: return galaxy.GalaxyInstance( diff --git a/src/ephemeris/common_parser.py b/src/ephemeris/common_parser.py index 569aede..30924bf 100644 --- a/src/ephemeris/common_parser.py +++ b/src/ephemeris/common_parser.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import argparse +import os class HideUnderscoresHelpFormatter(argparse.HelpFormatter): @@ -43,11 +44,12 @@ def get_common_args(login_required=True, log_file=False): add_log_file_argument(general_group) con_group = parser.add_argument_group("Galaxy connection") + default_galaxy = os.environ.get("EPHEMERIS_GALAXY") or "http://localhost:8080" con_group.add_argument( "-g", "--galaxy", help="Target Galaxy instance URL/IP address", - default="http://localhost:8080", + default=default_galaxy, ) if login_required: From fc3511a0130b229beaf426053e90e30de3b3d577 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 16:28:52 -0400 Subject: [PATCH 19/31] Skip broken data table checking logic if no data table reloads are found... --- src/ephemeris/run_data_managers.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ephemeris/run_data_managers.py b/src/ephemeris/run_data_managers.py index 9dcdc7b..ba98906 100644 --- a/src/ephemeris/run_data_managers.py +++ b/src/ephemeris/run_data_managers.py @@ -222,6 +222,11 @@ def data_table_entry_exists(self, data_table_name, entry, column="value"): def input_entries_exist_in_data_tables(self, data_tables, input_dict): """Checks whether name and value entries from the input are already present in the data tables. 
If an entry is missing in one of the tables, this function returns False""" + if data_tables is None or len(data_tables) == 0: + # this logic is all broken, I (@jmchilton) think, but let's just skip it all + # if we know we don't have data tables to check + return False + + value_entry = get_first_valid_entry(input_dict, self.possible_value_keys) name_entry = get_first_valid_entry(input_dict, self.possible_name_keys) From d1752f8035fa5952233f72b9c6c26115c1df75bf Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sat, 1 Jul 2023 17:06:42 -0400 Subject: [PATCH 20/31] Implement --tool-id-mode=short option to get short IDs for run-data-managers --- .../_idc_split_data_manager_genomes.py | 22 +++++++++++++++---- tests/test_split_genomes.py | 15 +++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index ee68441..be5c253 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -40,6 +40,7 @@ IsBuildComplete = Callable[[str, str], bool] TASK_FILE_NAME = "run_data_managers.yaml" +DEFAULT_TOOL_ID_MODE = "tool_shed_guid" log = logging.getLogger(__name__) @@ -64,13 +65,23 @@ class SplitOptions: split_genomes_path: str data_managers_path: str is_build_complete: IsBuildComplete + tool_id_mode: str = DEFAULT_TOOL_ID_MODE filters: Filters = Filters() -def tool_id_for(indexer: str, data_managers: Dict[str, DataManager]) -> str: +def tool_id_for(indexer: str, data_managers: Dict[str, DataManager], mode: str) -> str: data_manager = data_managers[indexer] assert data_manager, f"Could not find a target data manager for indexer name {indexer}" - return data_manager.tool_id + tool_shed_guid = data_manager.tool_id + if mode == "short": + _ts, _, _owner, _repo_name, rest = tool_shed_guid.split("/", 4) + if "/" in rest: + print(rest) + return rest.split("/")[0] + else: + return rest + else: + return tool_shed_guid class RunDataManager(BaseModel): @@ -130,7 +141,7 @@ def walk_over_incomplete_runs(split_options: SplitOptions): if do_fetch and not split_options.is_build_complete(build_id, fetch_indexer): log.info(f"Fetching: {build_id}") - fetch_tool_id = tool_id_for(fetch_indexer, data_managers) + fetch_tool_id = tool_id_for(fetch_indexer, data_managers, split_options.tool_id_mode) fetch_params = [] fetch_params.append({"dbkey_source|dbkey": genome["id"]}) source = genome.get("source") @@ -177,7 +188,7 @@ def walk_over_incomplete_runs(split_options: SplitOptions): log.info(f"Building: {build_id} {indexer}") - tool_id = tool_id_for(indexer, data_managers) + tool_id = tool_id_for(indexer, data_managers, split_options.tool_id_mode) params = [ {"all_fasta_source": "{{ item.id }}"}, {"sequence_name": "{{ item.name }}"}, @@ -234,6 +245,8 @@ def _parser(): parser.add_argument('--split-genomes-path', '-s', default="data_manager_tasks") parser.add_argument('--data-managers-path', default="data_managers.yml") + parser.add_argument("--tool-id-mode", choices=["tool_shed_guid", "short"], default=DEFAULT_TOOL_ID_MODE) + # filters parser.add_argument('--filter-stage', default=None) parser.add_argument('--filter-data-manager', default=None) @@ -264,6 +277,7 @@ def main(): split_options.merged_genomes_path = args.merged_genomes_path split_options.split_genomes_path = args.split_genomes_path split_options.is_build_complete = is_build_complete + split_options.tool_id_mode = args.tool_id_mode filters = Filters() filters.build_id = args.filter_build_id diff --git
a/tests/test_split_genomes.py b/tests/test_split_genomes.py index fbddcef..2ebb39f 100644 --- a/tests/test_split_genomes.py +++ b/tests/test_split_genomes.py @@ -95,6 +95,21 @@ def test_split_genomes(tmp_path: Path): assert data_manager.items[0]["dbkey"] == "hg19_rCRS_pUC18_phiX174" +def test_split_genomes_short_ids(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + split_options.tool_id_mode = "short" + split_genomes(split_options) + + new_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + new_task_run_yaml = new_task / "run_data_managers.yaml" + run = read_and_validate_run_data_manager_yaml(new_task_run_yaml) + assert len(run.data_managers) == 1 + data_manager = run.data_managers[0] + assert data_manager.id == "twobit_builder_data_manager" + + def test_split_genomes_filter_on_data_manager(tmp_path: Path): setup_mock_idc_dir(tmp_path) split_path = tmp_path / "split" From 0ca7489e6d050bb92ba6050b50764c964fca2541 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sun, 2 Jul 2023 19:10:52 -0400 Subject: [PATCH 21/31] Try a newer galaxy. --- tests/conftest.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index ee4c3e0..75fca90 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,7 +8,11 @@ # It needs to work well with dev. Alternatively we can pin this to 'master' or another stable branch. # Preferably a branch that updates with each stable release -GALAXY_IMAGE = "bgruening/galaxy-stable:20.05" +DOCKER_IMAGE = "galaxy-min" # or bgruening +GALAXY_IMAGE_MIN = "galaxy/galaxy-min:23.1-auto" +GALAXY_IMAGE_BGRUENING = "bgruening/galaxy-stable:20.05" +GALAXY_IMAGE = GALAXY_IMAGE_MIN if DOCKER_IMAGE == "galaxy-min" else GALAXY_IMAGE_BGRUENING +GALAXY_PORT = "80" if DOCKER_IMAGE == "bgruening" else "8080" GALAXY_ADMIN_KEY = "fakekey" GALAXY_ADMIN_PASSWORD = "password" GALAXY_ADMIN_USER = "admin@galaxy.org" @@ -28,11 +32,21 @@ def start_container(**kwargs): # and use the internal ip address instead. # But alas, the trappings of a proprietary BSD kernel compel us to do ugly workarounds. key = kwargs.get("api_key", GALAXY_ADMIN_KEY) - ensure_admin = kwargs.get("ensure_admin", True) + + # Set this to False as we try to get bootstrap working with galaxy-min. 
+ # ensure_admin = kwargs.get("ensure_admin", True) + ensure_admin = False client = docker.from_env() + + if DOCKER_IMAGE != "bgruening": + if "environment" not in kwargs: + kwargs["environment"] = {} + environment = kwargs["environment"] + environment["GALAXY_CONFIG_OVERRIDE_BOOTSTRAP_ADMIN_API_KEY"] = GALAXY_ADMIN_KEY + container = client.containers.run( - GALAXY_IMAGE, detach=True, ports={"80/tcp": None}, **kwargs + GALAXY_IMAGE, detach=True, ports={f"{GALAXY_PORT}/tcp": None}, **kwargs ) container_id = container.attrs.get("Id") print(container_id) @@ -46,7 +60,7 @@ def start_container(**kwargs): exposed_port = ( container_attributes.get("NetworkSettings") .get("Ports") - .get("80/tcp")[0] + .get(f"{GALAXY_PORT}/tcp")[0] .get("HostPort") ) @@ -56,6 +70,7 @@ def start_container(**kwargs): container_url, timeout=180, api_key=key, ensure_admin=ensure_admin ) if not ready: + print(client.containers.get(container_id).logs()) raise Exception("Failed to wait on Galaxy to start.") gi = GalaxyInstance(container_url, key=key) yield GalaxyContainer( From 6fc5d9be9e1436da04f6e338625b7d312bae2766 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sun, 2 Jul 2023 19:47:41 -0400 Subject: [PATCH 22/31] parameterize strategy for build tracking... --- .../_idc_split_data_manager_genomes.py | 46 +++++++++++++++---- tests/test_split_genomes.py | 6 +-- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index be5c253..4739867 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -12,7 +12,6 @@ from copy import deepcopy from typing import ( Any, - Callable, Dict, List, Optional, @@ -38,13 +37,19 @@ setup_global_logger, ) -IsBuildComplete = Callable[[str, str], bool] TASK_FILE_NAME = "run_data_managers.yaml" DEFAULT_TOOL_ID_MODE = "tool_shed_guid" +DEFAULT_BUILD_TRACKING_MODE = "galaxy_history" log = logging.getLogger(__name__) +class BuildTracker: + + def is_build_complete(self, build_id: str, indexer_name: str) -> bool: + pass + + class Filters: stage: Optional[int] = None data_manager: Optional[str] = None @@ -64,7 +69,7 @@ class SplitOptions: merged_genomes_path: str split_genomes_path: str data_managers_path: str - is_build_complete: IsBuildComplete + build_tracker: BuildTracker tool_id_mode: str = DEFAULT_TOOL_ID_MODE filters: Filters = Filters() @@ -139,7 +144,7 @@ def walk_over_incomplete_runs(split_options: SplitOptions): if do_fetch and split_options.filters.filter_out_stage(0): do_fetch = False - if do_fetch and not split_options.is_build_complete(build_id, fetch_indexer): + if do_fetch and not split_options.build_tracker.is_build_complete(build_id, fetch_indexer): log.info(f"Fetching: {build_id}") fetch_tool_id = tool_id_for(fetch_indexer, data_managers, split_options.tool_id_mode) fetch_params = [] @@ -182,7 +187,7 @@ def walk_over_incomplete_runs(split_options: SplitOptions): if split_options.filters.filter_out_stage(1): continue - if split_options.is_build_complete(build_id, indexer): + if split_options.build_tracker.is_build_complete(build_id, indexer): log.debug(f"Build is already completed: {build_id} {indexer}") continue @@ -227,16 +232,30 @@ def write_task_file(build_id: str, indexer: str, run_data_manager: RunDataManage write_task_file(build_id, indexer, run_data_manager) -class GalaxyHistoryIsBuildComplete: +class GalaxyHistoryBuildTracker(BuildTracker): def __init__(self, history_names: List[str]): 
self._history_names = history_names - def __call__(self, build_id: str, indexer_name: str) -> bool: + def is_build_complete(self, build_id: str, indexer_name: str) -> bool: target_history_name = f"idc-{build_id}-{indexer_name}" return target_history_name in self._history_names +class DirectoryBuildTracker(BuildTracker): + + def __init__(self, split_genomes_path: str): + self._split_genomes_path = split_genomes_path + + def is_build_complete(self, build_id: str, indexer_name: str) -> bool: + target_directory = os.path.join(self._split_genomes_path, build_id, indexer_name) + if not os.path.exists(target_directory): + return False + bundle_path = os.path.join(target_directory, "data_bundle.tgz") + if not os.path.exists(bundle_path): + return False + + def _parser(): """returns the parser object.""" # login required to check history... parser = get_common_args(login_required=True, log_file=True) @@ -246,6 +265,7 @@ def _parser(): parser.add_argument('--data-managers-path', default="data_managers.yml") parser.add_argument("--tool-id-mode", choices=["tool_shed_guid", "short"], default=DEFAULT_TOOL_ID_MODE) + parser.add_argument("--build-tracking", choices=["galaxy_history", "directory"], default=DEFAULT_BUILD_TRACKING_MODE) # filters parser.add_argument('--filter-stage', default=None) @@ -260,6 +280,14 @@ def get_galaxy_history_names(args) -> List[str]: return [h["name"] for h in gi.histories.get_histories()] +def get_build_tracker(args): + if args.build_tracking == "galaxy_history": + build_tracker = GalaxyHistoryBuildTracker(get_galaxy_history_names(args)) + else: + build_tracker = DirectoryBuildTracker(args.split_genomes_path) + return build_tracker + + def main(): disable_external_library_logging() parser = _parser() @@ -270,13 +298,11 @@ def main(): else: log.setLevel(logging.INFO) - is_build_complete = GalaxyHistoryIsBuildComplete(get_galaxy_history_names(args)) - split_options = SplitOptions() split_options.data_managers_path = args.data_managers_path split_options.merged_genomes_path = args.merged_genomes_path split_options.split_genomes_path = args.split_genomes_path - split_options.is_build_complete = is_build_complete + split_options.build_tracker = get_build_tracker(args) split_options.tool_id_mode = args.tool_id_mode filters = Filters() diff --git a/tests/test_split_genomes.py b/tests/test_split_genomes.py index 2ebb39f..6108f17 100644 --- a/tests/test_split_genomes.py +++ b/tests/test_split_genomes.py @@ -4,7 +4,7 @@ from ephemeris._idc_split_data_manager_genomes import ( Filters, - GalaxyHistoryIsBuildComplete, + GalaxyHistoryBuildTracker, RunDataManagers, split_genomes, SplitOptions, @@ -65,13 +65,13 @@ def read_and_validate_run_data_manager_yaml(path): def split_options_for(tmp_path: Path) -> SplitOptions: history_names = ["idc-hg19_rCRS_pUC18_phiX174-data_manager_star_index_builder"] - is_build_complete = GalaxyHistoryIsBuildComplete(history_names) + build_tracker = GalaxyHistoryBuildTracker(history_names) split_options = SplitOptions() split_options.merged_genomes_path = tmp_path / "genomes.yml" split_options.split_genomes_path = str(tmp_path / "split") split_options.data_managers_path = tmp_path / "data_managers.yml" - split_options.is_build_complete = is_build_complete + split_options.build_tracker = build_tracker return split_options From 7bac8767bfcb513ff7e0f169dd69dfcae488ebe6 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Sun, 2 Jul 2023 20:18:56 -0400 Subject: [PATCH 23/31] Refactor splitting for reuse in ephemeris data running.
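For orientation, the two completeness strategies this refactor moves into the new _idc_build module can be restated as a small self-contained sketch (names mirror the diffs above; the directory variant is condensed here to return an explicit boolean, where the version added in the previous commit falls off the end and returns None once the bundle exists):

    import os
    from typing import List


    class BuildTracker:
        """Strategy interface: has (build_id, indexer) already been built?"""

        def is_build_complete(self, build_id: str, indexer_name: str) -> bool:
            raise NotImplementedError


    class GalaxyHistoryBuildTracker(BuildTracker):
        """Complete when a history named idc-<build_id>-<indexer> exists."""

        def __init__(self, history_names: List[str]):
            self._history_names = history_names

        def is_build_complete(self, build_id: str, indexer_name: str) -> bool:
            return f"idc-{build_id}-{indexer_name}" in self._history_names


    class DirectoryBuildTracker(BuildTracker):
        """Complete when <path>/<build_id>/<indexer>/data_bundle.tgz exists."""

        def __init__(self, split_genomes_path: str):
            self._split_genomes_path = split_genomes_path

        def is_build_complete(self, build_id: str, indexer_name: str) -> bool:
            bundle = os.path.join(
                self._split_genomes_path, build_id, indexer_name, "data_bundle.tgz"
            )
            return os.path.exists(bundle)


    # The history-based tracker, seeded the way the tests seed it:
    tracker = GalaxyHistoryBuildTracker(
        ["idc-hg19_rCRS_pUC18_phiX174-data_manager_star_index_builder"]
    )
    assert tracker.is_build_complete(
        "hg19_rCRS_pUC18_phiX174", "data_manager_star_index_builder"
    )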
--- src/ephemeris/_idc_build.py | 308 ++++++++++++++++++ .../_idc_split_data_manager_genomes.py | 300 +---------------- tests/test_split_genomes.py | 11 +- 3 files changed, 328 insertions(+), 291 deletions(-) create mode 100644 src/ephemeris/_idc_build.py diff --git a/src/ephemeris/_idc_build.py b/src/ephemeris/_idc_build.py new file mode 100644 index 0000000..df1a2b9 --- /dev/null +++ b/src/ephemeris/_idc_build.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python +"""Build script that takes an IDC-style configuration and builds relevant data. + +The IDC splits things out and manages runs in stages in order to deal with CVMFS, +but for most deployments one should be able to just run the data managers directly +and have the data installed or fetch data bundles. This will be that script and +will reuse a lot of what is in run_data_managers and will work with a version of +the data bundle fetching adapted for ephemeris. +""" +import logging +import os +import re +from copy import deepcopy +from typing import ( + Any, + Dict, + List, + Optional, +) + +import yaml +from galaxy.util import safe_makedirs +from pydantic import ( + BaseModel, + Extra, +) + +from . import get_galaxy_connection +from ._idc_data_managers_to_tools import ( + DataManager, + read_data_managers_configuration, +) +from .common_parser import ( + get_common_args, +) +from .ephemeris_log import ( + disable_external_library_logging, + setup_global_logger, +) + +TASK_FILE_NAME = "run_data_managers.yaml" +DEFAULT_TOOL_ID_MODE = "tool_shed_guid" +DEFAULT_BUILD_TRACKING_MODE = "galaxy_history" + +log = logging.getLogger(__name__) + + +class BuildTracker: + + def is_build_complete(self, build_id: str, indexer_name: str) -> bool: + pass + + +class Filters: + stage: Optional[int] = None + data_manager: Optional[str] = None + build_id: Optional[str] = None + + def filter_out_data_manager(self, data_manager: str) -> bool: + return bool(self.data_manager and data_manager != self.data_manager) + + def filter_out_build_id(self, build_id: str) -> bool: + return bool(self.build_id and build_id != self.build_id) + + def filter_out_stage(self, stage: int) -> bool: + return bool(self.stage is not None and self.stage != stage) + + +class BuildOptions: + merged_genomes_path: str + split_genomes_path: str + data_managers_path: str + build_tracker: BuildTracker + tool_id_mode: str = DEFAULT_TOOL_ID_MODE + filters: Filters = Filters() + + +def tool_id_for(indexer: str, data_managers: Dict[str, DataManager], mode: str) -> str: + data_manager = data_managers[indexer] + assert data_manager, f"Could not find a target data manager for indexer name {indexer}" + tool_shed_guid = data_manager.tool_id + if mode == "short": + _ts, _, _owner, _repo_name, rest = tool_shed_guid.split("/", 4) + if "/" in rest: + print(rest) + return rest.split("/")[0] + else: + return rest + else: + return tool_shed_guid + + +class RunDataManager(BaseModel): + id: str + items: Optional[List[Any]] = None + params: Optional[List[Any]] = None + data_table_reload: Optional[List[str]] = None + + +class RunDataManagers(BaseModel): + data_managers: List[RunDataManager] + + +class DataManager(BaseModel, extra=Extra.forbid): + tags: List[str] + tool_id: str + + +class DataManagers(BaseModel, extra=Extra.forbid): + __root__: Dict[str, DataManager] + + +class Genome(BaseModel): + pass + + +class Genomes(BaseModel): + genomes: List[Genome] + + +def write_run_data_manager_to_file(run_data_manager: RunDataManager, path: str): + parent, _ = os.path.split(path) + if not os.path.exists(parent): + 
safe_makedirs(parent) + run_data_managers = RunDataManagers(data_managers=[run_data_manager]) + with open(path, "w") as of: + yaml.safe_dump(run_data_managers.dict(exclude_unset=True), of) + + +def walk_over_incomplete_runs(build_options: BuildOptions): + data_managers = read_data_managers_configuration(build_options.data_managers_path) + with open(build_options.merged_genomes_path) as f: + genomes_all = yaml.safe_load(f) + genomes = genomes_all["genomes"] + for genome in genomes: + build_id = genome["id"] + if build_options.filters.filter_out_build_id(build_id): + continue + + fetch_indexer = "data_manager_fetch_genome_dbkeys_all_fasta" + do_fetch = not build_options.filters.filter_out_data_manager(fetch_indexer) + source = genome.get("source") + if source is None: + do_fetch = False + if do_fetch and build_options.filters.filter_out_stage(0): + do_fetch = False + + if do_fetch and not build_options.build_tracker.is_build_complete(build_id, fetch_indexer): + log.info(f"Fetching: {build_id}") + fetch_tool_id = tool_id_for(fetch_indexer, data_managers, build_options.tool_id_mode) + fetch_params = [] + fetch_params.append({"dbkey_source|dbkey": genome["id"]}) + source = genome.get("source") + if source == "ucsc": + fetch_params.append({"reference_source|reference_source_selector": "ucsc"}) + fetch_params.append({"reference_source|requested_dbkey": genome["id"]}) + fetch_params.append({"sequence_name": genome["description"]}) + elif re.match("^[A-Z_]+[0-9.]+", source): + fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) + fetch_params.append({"reference_source|reference_source_selector": "ncbi"}) + fetch_params.append( + {"reference_source|requested_identifier": source} + ) + fetch_params.append({"sequence_name": genome["description"]}) + fetch_params.append({"sequence.id": genome["id"]}) + elif re.match("^http", source): + fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) + fetch_params.append({"reference_source|reference_source_selector": "url"}) + fetch_params.append({"reference_source|user_url": source}) + fetch_params.append({"sequence_name": genome["description"]}) + fetch_params.append({"sequence.id": genome["id"]}) + + fetch_run_data_manager = RunDataManager( + id=fetch_tool_id, + params=fetch_params, + # Not needed according to Marius + # data_table_reload=["all_fasta", "__dbkeys__"], + ) + yield (build_id, fetch_indexer, fetch_run_data_manager) + else: + log.debug(f"Fetch is already completed: {build_id}") + + indexers = genome.get("indexers", []) + for indexer in indexers: + if build_options.filters.filter_out_data_manager(indexer): + continue + + if build_options.filters.filter_out_stage(1): + continue + + if build_options.build_tracker.is_build_complete(build_id, indexer): + log.debug(f"Build is already completed: {build_id} {indexer}") + continue + + log.info(f"Building: {build_id} {indexer}") + + tool_id = tool_id_for(indexer, data_managers, build_options.tool_id_mode) + params = [ + {"all_fasta_source": "{{ item.id }}"}, + {"sequence_name": "{{ item.name }}"}, + {"sequence_id": "{{ item.id }}"}, + ] + # why is this not pulled from the data managers conf? 
-nate + if re.search("bwa", tool_id): + params.append({"index_algorithm": "bwtsw"}) + if re.search("color_space", tool_id): + continue + + item = deepcopy(genome) + item.pop("indexers", None) + item.pop("skiplist", None) + + run_data_manager = RunDataManager( + id=tool_id, + params=params, + items=[item], + ) + yield (build_id, indexer, run_data_manager) + + +class GalaxyHistoryBuildTracker(BuildTracker): + + def __init__(self, history_names: List[str]): + self._history_names = history_names + + def is_build_complete(self, build_id: str, indexer_name: str) -> bool: + target_history_name = f"idc-{build_id}-{indexer_name}" + return target_history_name in self._history_names + + +class DirectoryBuildTracker(BuildTracker): + + def __init__(self, split_genomes_path: str): + self._split_genomes_path = split_genomes_path + + def is_build_complete(self, build_id: str, indexer_name: str) -> bool: + target_directory = os.path.join(self._split_genomes_path, build_id, indexer_name) + if not os.path.exists(target_directory): + return False + bundle_path = os.path.join(target_directory, "data_bundle.tgz") + if not os.path.exists(bundle_path): + return False + + +def _parser(): + """returns the parser object.""" + # login required to check history... + parser = get_common_args(login_required=True, log_file=True) + parser.add_argument('--merged-genomes-path', '-m', default="genomes.yml") + parser.add_argument('--split-genomes-path', '-s', default="data_manager_tasks") + parser.add_argument('--data-managers-path', default="data_managers.yml") + + parser.add_argument("--tool-id-mode", choices=["tool_shed_guid", "short"], default=DEFAULT_TOOL_ID_MODE) + parser.add_argument("--build-tracking", choices=["galaxy_history", "directory"], default=DEFAULT_BUILD_TRACKING_MODE) + + # filters + parser.add_argument('--filter-stage', default=None) + parser.add_argument('--filter-data-manager', default=None) + parser.add_argument('--filter-build-id', default=None) + + return parser + + +def get_galaxy_history_names(args) -> List[str]: + gi = get_galaxy_connection(args, login_required=True) + return [h["name"] for h in gi.histories.get_histories()] + + +def get_build_tracker(args): + if args.build_tracking == "galaxy_history": + build_tracker = GalaxyHistoryBuildTracker(get_galaxy_history_names(args)) + else: + build_tracker = DirectoryBuildTracker(args.split_genomes_path) + return build_tracker + + +def configure_python_for_build() -> BuildOptions: + disable_external_library_logging() + parser = _parser() + args = parser.parse_args() + log = setup_global_logger(name=__name__, log_file=args.log_file) + if args.verbose: + log.setLevel(logging.DEBUG) + else: + log.setLevel(logging.INFO) + + build_options = BuildOptions() + build_options.data_managers_path = args.data_managers_path + build_options.merged_genomes_path = args.merged_genomes_path + build_options.split_genomes_path = args.split_genomes_path + build_options.build_tracker = get_build_tracker(args) + build_options.tool_id_mode = args.tool_id_mode + + filters = Filters() + filters.build_id = args.filter_build_id + filters.data_manager = args.filter_data_manager + filters.stage = args.filter_stage + build_options.filters = filters + + +def main(): + build_options = configure_python_for_build() + # TODO: build request genomes... 
+ print(build_options) + + +if __name__ == "__main__": + main() diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py index 4739867..87e0456 100644 --- a/src/ephemeris/_idc_split_data_manager_genomes.py +++ b/src/ephemeris/_idc_split_data_manager_genomes.py @@ -6,312 +6,38 @@ by genomes.yml that have already been executed and appear in the target installed data table configuration. """ -import logging import os -import re -from copy import deepcopy -from typing import ( - Any, - Dict, - List, - Optional, -) -import yaml from galaxy.util import safe_makedirs -from pydantic import ( - BaseModel, - Extra, -) -from . import get_galaxy_connection -from ._idc_data_managers_to_tools import ( - DataManager, - read_data_managers_configuration, -) -from .common_parser import ( - get_common_args, -) -from .ephemeris_log import ( - disable_external_library_logging, - setup_global_logger, +from ._idc_build import ( + BuildOptions, + configure_python_for_build, + RunDataManager, + TASK_FILE_NAME, + walk_over_incomplete_runs, + write_run_data_manager_to_file, ) -TASK_FILE_NAME = "run_data_managers.yaml" -DEFAULT_TOOL_ID_MODE = "tool_shed_guid" -DEFAULT_BUILD_TRACKING_MODE = "galaxy_history" - -log = logging.getLogger(__name__) - - -class BuildTracker: - - def is_build_complete(self, build_id: str, indexer_name: str) -> bool: - pass - - -class Filters: - stage: Optional[int] = None - data_manager: Optional[str] = None - build_id: Optional[str] = None - - def filter_out_data_manager(self, data_manager: str) -> bool: - return bool(self.data_manager and data_manager != self.data_manager) - - def filter_out_build_id(self, build_id: str) -> bool: - return bool(self.build_id and build_id != self.build_id) - - def filter_out_stage(self, stage: int) -> bool: - return bool(self.stage is not None and self.stage != stage) - - -class SplitOptions: - merged_genomes_path: str - split_genomes_path: str - data_managers_path: str - build_tracker: BuildTracker - tool_id_mode: str = DEFAULT_TOOL_ID_MODE - filters: Filters = Filters() - - -def tool_id_for(indexer: str, data_managers: Dict[str, DataManager], mode: str) -> str: - data_manager = data_managers[indexer] - assert data_manager, f"Could not find a target data manager for indexer name {indexer}" - tool_shed_guid = data_manager.tool_id - if mode == "short": - _ts, _, _owner, _repo_name, rest = tool_shed_guid.split("/", 4) - if "/" in rest: - print(rest) - return rest.split("/")[0] - else: - return rest - else: - return tool_shed_guid - - -class RunDataManager(BaseModel): - id: str - items: Optional[List[Any]] = None - params: Optional[List[Any]] = None - data_table_reload: Optional[List[str]] = None - - -class RunDataManagers(BaseModel): - data_managers: List[RunDataManager] - - -class DataManager(BaseModel, extra=Extra.forbid): - tags: List[str] - tool_id: str - - -class DataManagers(BaseModel, extra=Extra.forbid): - __root__: Dict[str, DataManager] - - -class Genome(BaseModel): - pass - - -class Genomes(BaseModel): - genomes: List[Genome] - - -def write_run_data_manager_to_file(run_data_manager: RunDataManager, path: str): - parent, _ = os.path.split(path) - if not os.path.exists(parent): - safe_makedirs(parent) - run_data_managers = RunDataManagers(data_managers=[run_data_manager]) - with open(path, "w") as of: - yaml.safe_dump(run_data_managers.dict(exclude_unset=True), of) - -def walk_over_incomplete_runs(split_options: SplitOptions): - data_managers = 
read_data_managers_configuration(split_options.data_managers_path) - with open(split_options.merged_genomes_path) as f: - genomes_all = yaml.safe_load(f) - genomes = genomes_all["genomes"] - for genome in genomes: - build_id = genome["id"] - if split_options.filters.filter_out_build_id(build_id): - continue - - fetch_indexer = "data_manager_fetch_genome_dbkeys_all_fasta" - do_fetch = not split_options.filters.filter_out_data_manager(fetch_indexer) - source = genome.get("source") - if source is None: - do_fetch = False - if do_fetch and split_options.filters.filter_out_stage(0): - do_fetch = False - - if do_fetch and not split_options.build_tracker.is_build_complete(build_id, fetch_indexer): - log.info(f"Fetching: {build_id}") - fetch_tool_id = tool_id_for(fetch_indexer, data_managers, split_options.tool_id_mode) - fetch_params = [] - fetch_params.append({"dbkey_source|dbkey": genome["id"]}) - source = genome.get("source") - if source == "ucsc": - fetch_params.append({"reference_source|reference_source_selector": "ucsc"}) - fetch_params.append({"reference_source|requested_dbkey": genome["id"]}) - fetch_params.append({"sequence_name": genome["description"]}) - elif re.match("^[A-Z_]+[0-9.]+", source): - fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) - fetch_params.append({"reference_source|reference_source_selector": "ncbi"}) - fetch_params.append( - {"reference_source|requested_identifier": source} - ) - fetch_params.append({"sequence_name": genome["description"]}) - fetch_params.append({"sequence.id": genome["id"]}) - elif re.match("^http", source): - fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) - fetch_params.append({"reference_source|reference_source_selector": "url"}) - fetch_params.append({"reference_source|user_url": source}) - fetch_params.append({"sequence_name": genome["description"]}) - fetch_params.append({"sequence.id": genome["id"]}) - - fetch_run_data_manager = RunDataManager( - id=fetch_tool_id, - params=fetch_params, - # Not needed according to Marius - # data_table_reload=["all_fasta", "__dbkeys__"], - ) - yield (build_id, fetch_indexer, fetch_run_data_manager) - else: - log.debug(f"Fetch is already completed: {build_id}") - - indexers = genome.get("indexers", []) - for indexer in indexers: - if split_options.filters.filter_out_data_manager(indexer): - continue - - if split_options.filters.filter_out_stage(1): - continue - - if split_options.build_tracker.is_build_complete(build_id, indexer): - log.debug(f"Build is already completed: {build_id} {indexer}") - continue - - log.info(f"Building: {build_id} {indexer}") - - tool_id = tool_id_for(indexer, data_managers, split_options.tool_id_mode) - params = [ - {"all_fasta_source": "{{ item.id }}"}, - {"sequence_name": "{{ item.name }}"}, - {"sequence_id": "{{ item.id }}"}, - ] - # why is this not pulled from the data managers conf? 
-nate - if re.search("bwa", tool_id): - params.append({"index_algorithm": "bwtsw"}) - if re.search("color_space", tool_id): - continue - - item = deepcopy(genome) - item.pop("indexers", None) - item.pop("skiplist", None) - - run_data_manager = RunDataManager( - id=tool_id, - params=params, - items=[item], - ) - yield (build_id, indexer, run_data_manager) - - -def split_genomes(split_options: SplitOptions) -> None: +def split_genomes(build_options: BuildOptions) -> None: def write_task_file(build_id: str, indexer: str, run_data_manager: RunDataManager): - split_genomes_path = split_options.split_genomes_path - if not os.path.exists(split_options.split_genomes_path): + split_genomes_path = build_options.split_genomes_path + if not os.path.exists(build_options.split_genomes_path): safe_makedirs(split_genomes_path) task_file_dir = os.path.join(split_genomes_path, build_id, indexer) task_file = os.path.join(task_file_dir, TASK_FILE_NAME) write_run_data_manager_to_file(run_data_manager, task_file) - for build_id, indexer, run_data_manager in walk_over_incomplete_runs(split_options): + for build_id, indexer, run_data_manager in walk_over_incomplete_runs(build_options): write_task_file(build_id, indexer, run_data_manager) -class GalaxyHistoryBuildTracker(BuildTracker): - - def __init__(self, history_names: List[str]): - self._history_names = history_names - - def is_build_complete(self, build_id: str, indexer_name: str) -> bool: - target_history_name = f"idc-{build_id}-{indexer_name}" - return target_history_name in self._history_names - - -class DirectoryBuildTracker(BuildTracker): - - def __init__(self, split_genomes_path: str): - self._split_genomes_path = split_genomes_path - - def is_build_complete(self, build_id: str, indexer_name: str) -> bool: - target_directory = os.path.join(self._split_genomes_path, build_id, indexer_name) - if not os.path.exists(target_directory): - return False - bundle_path = os.path.join(target_directory, "data_bundle.tgz") - if not os.path.exists(bundle_path): - return False - - -def _parser(): - """returns the parser object.""" - # login required to check history... 
- parser = get_common_args(login_required=True, log_file=True) - parser.add_argument('--merged-genomes-path', '-m', default="genomes.yml") - parser.add_argument('--split-genomes-path', '-s', default="data_manager_tasks") - parser.add_argument('--data-managers-path', default="data_managers.yml") - - parser.add_argument("--tool-id-mode", choices=["tool_shed_guid", "short"], default=DEFAULT_TOOL_ID_MODE) - parser.add_argument("--build-tracking", choices=["galaxy_history", "directory"], default=DEFAULT_BUILD_TRACKING_MODE) - - # filters - parser.add_argument('--filter-stage', default=None) - parser.add_argument('--filter-data-manager', default=None) - parser.add_argument('--filter-build-id', default=None) - - return parser - - -def get_galaxy_history_names(args) -> List[str]: - gi = get_galaxy_connection(args, login_required=True) - return [h["name"] for h in gi.histories.get_histories()] - - -def get_build_tracker(args): - if args.build_tracking == "galaxy_history": - build_tracker = GalaxyHistoryBuildTracker(get_galaxy_history_names(args)) - else: - build_tracker = DirectoryBuildTracker(args.split_genomes_path) - return build_tracker - - def main(): - disable_external_library_logging() - parser = _parser() - args = parser.parse_args() - log = setup_global_logger(name=__name__, log_file=args.log_file) - if args.verbose: - log.setLevel(logging.DEBUG) - else: - log.setLevel(logging.INFO) - - split_options = SplitOptions() - split_options.data_managers_path = args.data_managers_path - split_options.merged_genomes_path = args.merged_genomes_path - split_options.split_genomes_path = args.split_genomes_path - split_options.build_tracker = get_build_tracker(args) - split_options.tool_id_mode = args.tool_id_mode - - filters = Filters() - filters.build_id = args.filter_build_id - filters.data_manager = args.filter_data_manager - filters.stage = args.filter_stage - split_options.filters = filters - - split_genomes(split_options) + build_options = configure_python_for_build() + split_genomes(build_options) if __name__ == "__main__": diff --git a/tests/test_split_genomes.py b/tests/test_split_genomes.py index 6108f17..7d90e28 100644 --- a/tests/test_split_genomes.py +++ b/tests/test_split_genomes.py @@ -2,14 +2,17 @@ import yaml -from ephemeris._idc_split_data_manager_genomes import ( +from ephemeris._idc_build import ( + BuildOptions, Filters, GalaxyHistoryBuildTracker, RunDataManagers, +) +from ephemeris._idc_split_data_manager_genomes import ( split_genomes, - SplitOptions, ) + MERGED_YAML_STR = """ genomes: - dbkey: hg19_rCRS_pUC18_phiX174 @@ -63,11 +66,11 @@ def read_and_validate_run_data_manager_yaml(path): return RunDataManagers(**yaml.safe_load(f)) -def split_options_for(tmp_path: Path) -> SplitOptions: +def split_options_for(tmp_path: Path) -> BuildOptions: history_names = ["idc-hg19_rCRS_pUC18_phiX174-data_manager_star_index_builder"] build_tracker = GalaxyHistoryBuildTracker(history_names) - split_options = SplitOptions() + split_options = BuildOptions() split_options.merged_genomes_path = tmp_path / "genomes.yml" split_options.split_genomes_path = str(tmp_path / "split") split_options.data_managers_path = tmp_path / "data_managers.yml" From 0d822f1efad1bbe16fd9419b97930f54e7db792b Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 5 Jul 2023 17:24:19 -0400 Subject: [PATCH 24/31] Create a user for key. 
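The bootstrap admin API key wired into the container environment is a master key rather than a key bound to a real account, so user-scoped API calls need an actual user. The shape of the fix, sketched outside the fixture (the URL and credentials are the test constants from conftest.py; this is an illustration, not the fixture code itself):

    from bioblend.galaxy import GalaxyInstance

    # Talk to Galaxy with the bootstrap master key first
    # (set via GALAXY_CONFIG_OVERRIDE_BOOTSTRAP_ADMIN_API_KEY).
    bootstrap_gi = GalaxyInstance("http://localhost:8080", key="fakekey")

    # Create a real admin account and fetch a key tied to it.
    user_dict = bootstrap_gi.users.create_local_user(
        "admin", "admin@galaxy.org", "password"
    )
    user_api_key = bootstrap_gi.users.get_user_apikey(user_dict["id"])

    # Subsequent calls run as that user.
    gi = GalaxyInstance("http://localhost:8080", key=user_api_key)

As the next few commits show, get_user_apikey only reads an existing key and has nothing to return for a brand-new user, which is why this sequence keeps changing until get_or_create_user_apikey appears in PATCH 27.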
--- tests/conftest.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 75fca90..e4dd6d6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -73,6 +73,11 @@ def start_container(**kwargs): print(client.containers.get(container_id).logs()) raise Exception("Failed to wait on Galaxy to start.") gi = GalaxyInstance(container_url, key=key) + if DOCKER_IMAGE != "bgruening": + user_dict = gi.users.create_local_user("admin", GALAXY_ADMIN_USER, GALAXY_ADMIN_PASSWORD) + user_api_key = gi.users.get_user_apikey(user_dict["id"]) + gi = GalaxyInstance(container_url, key=user_api_key) + yield GalaxyContainer( url=container_url, container=container, attributes=container_attributes, gi=gi ) From 661acff3d1ed492a3178742dcf63d515edeadaaf Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 5 Jul 2023 17:29:27 -0400 Subject: [PATCH 25/31] Parameterize history build tracking prefix. --- src/ephemeris/_idc_build.py | 9 ++++++--- tests/conftest.py | 5 +++-- tests/test_split_genomes.py | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/ephemeris/_idc_build.py b/src/ephemeris/_idc_build.py index df1a2b9..b2e0b8d 100644 --- a/src/ephemeris/_idc_build.py +++ b/src/ephemeris/_idc_build.py @@ -41,6 +41,7 @@ TASK_FILE_NAME = "run_data_managers.yaml" DEFAULT_TOOL_ID_MODE = "tool_shed_guid" DEFAULT_BUILD_TRACKING_MODE = "galaxy_history" +DEFAULT_HISTORY_BUILD_PREFIX = "idc" log = logging.getLogger(__name__) @@ -220,11 +221,12 @@ def walk_over_incomplete_runs(build_options: BuildOptions): class GalaxyHistoryBuildTracker(BuildTracker): - def __init__(self, history_names: List[str]): + def __init__(self, history_names: List[str], history_build_prefix: str): self._history_names = history_names + self._history_build_prefix = history_build_prefix def is_build_complete(self, build_id: str, indexer_name: str) -> bool: - target_history_name = f"idc-{build_id}-{indexer_name}" + target_history_name = f"{self._history_build_prefix}-{build_id}-{indexer_name}" return target_history_name in self._history_names @@ -252,6 +254,7 @@ def _parser(): parser.add_argument("--tool-id-mode", choices=["tool_shed_guid", "short"], default=DEFAULT_TOOL_ID_MODE) parser.add_argument("--build-tracking", choices=["galaxy_history", "directory"], default=DEFAULT_BUILD_TRACKING_MODE) + parser.add_argument("--history-build-prefix", default=DEFAULT_HISTORY_BUILD_PREFIX) # filters parser.add_argument('--filter-stage', default=None) @@ -268,7 +271,7 @@ def get_galaxy_history_names(args) -> List[str]: def get_build_tracker(args): if args.build_tracking == "galaxy_history": - build_tracker = GalaxyHistoryBuildTracker(get_galaxy_history_names(args)) + build_tracker = GalaxyHistoryBuildTracker(get_galaxy_history_names(args), args.history_build_prefix) else: build_tracker = DirectoryBuildTracker(args.split_genomes_path) return build_tracker diff --git a/tests/conftest.py b/tests/conftest.py index e4dd6d6..003f4b2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -44,6 +44,7 @@ def start_container(**kwargs): kwargs["environment"] = {} environment = kwargs["environment"] environment["GALAXY_CONFIG_OVERRIDE_BOOTSTRAP_ADMIN_API_KEY"] = GALAXY_ADMIN_KEY + environment["GALAXY_CONFIG_OVERRIDE_ADMIN_USERS"] = GALAXY_ADMIN_USER container = client.containers.run( GALAXY_IMAGE, detach=True, ports={f"{GALAXY_PORT}/tcp": None}, **kwargs @@ -75,8 +76,8 @@ def start_container(**kwargs): gi = GalaxyInstance(container_url, key=key) if DOCKER_IMAGE != "bgruening": user_dict = 
gi.users.create_local_user("admin", GALAXY_ADMIN_USER, GALAXY_ADMIN_PASSWORD) - user_api_key = gi.users.get_user_apikey(user_dict["id"]) - gi = GalaxyInstance(container_url, key=user_api_key) + print(user_dict) + gi = GalaxyInstance(container_url, email=GALAXY_ADMIN_USER,password=GALAXY_ADMIN_PASSWORD) yield GalaxyContainer( url=container_url, container=container, attributes=container_attributes, gi=gi diff --git a/tests/test_split_genomes.py b/tests/test_split_genomes.py index 7d90e28..0df8d7a 100644 --- a/tests/test_split_genomes.py +++ b/tests/test_split_genomes.py @@ -68,7 +68,7 @@ def read_and_validate_run_data_manager_yaml(path): def split_options_for(tmp_path: Path) -> BuildOptions: history_names = ["idc-hg19_rCRS_pUC18_phiX174-data_manager_star_index_builder"] - build_tracker = GalaxyHistoryBuildTracker(history_names) + build_tracker = GalaxyHistoryBuildTracker(history_names, "idc") split_options = BuildOptions() split_options.merged_genomes_path = tmp_path / "genomes.yml" From c03fb9caccedcf024763a8f369534ad202b98cef Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 5 Jul 2023 17:40:05 -0400 Subject: [PATCH 26/31] Lets actually setup an admin user and set ... --- tests/conftest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 003f4b2..7ee7a31 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -75,8 +75,7 @@ def start_container(**kwargs): raise Exception("Failed to wait on Galaxy to start.") gi = GalaxyInstance(container_url, key=key) if DOCKER_IMAGE != "bgruening": - user_dict = gi.users.create_local_user("admin", GALAXY_ADMIN_USER, GALAXY_ADMIN_PASSWORD) - print(user_dict) + gi.users.create_local_user("admin", GALAXY_ADMIN_USER, GALAXY_ADMIN_PASSWORD) gi = GalaxyInstance(container_url, email=GALAXY_ADMIN_USER,password=GALAXY_ADMIN_PASSWORD) yield GalaxyContainer( From 2db92d6aa5542fe9dced63ae3c258d90df3314d7 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 5 Jul 2023 17:52:07 -0400 Subject: [PATCH 27/31] Lets try this? 
--- tests/conftest.py | 12 +++++----- tests/test_run_data_managers.py | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 7ee7a31..24f29b1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,7 +18,7 @@ GALAXY_ADMIN_USER = "admin@galaxy.org" GalaxyContainer = namedtuple( - "GalaxyContainer", ["url", "container", "attributes", "gi"] + "GalaxyContainer", ["url", "container", "attributes", "gi", "api_key"] ) @@ -75,10 +75,12 @@ def start_container(**kwargs): raise Exception("Failed to wait on Galaxy to start.") gi = GalaxyInstance(container_url, key=key) if DOCKER_IMAGE != "bgruening": - gi.users.create_local_user("admin", GALAXY_ADMIN_USER, GALAXY_ADMIN_PASSWORD) - gi = GalaxyInstance(container_url, email=GALAXY_ADMIN_USER, password=GALAXY_ADMIN_PASSWORD) - + user_dict = gi.users.create_local_user("admin", GALAXY_ADMIN_USER, GALAXY_ADMIN_PASSWORD) + api_key = gi.users.get_or_create_user_apikey(user_dict["id"]) + gi = GalaxyInstance(container_url, key=api_key) + else: + api_key = GALAXY_ADMIN_KEY yield GalaxyContainer( - url=container_url, container=container, attributes=container_attributes, gi=gi + url=container_url, container=container, attributes=container_attributes, gi=gi, api_key=api_key ) container.remove(force=True) diff --git a/tests/test_run_data_managers.py b/tests/test_run_data_managers.py index bf43b65..5b8e86b 100644 --- a/tests/test_run_data_managers.py +++ b/tests/test_run_data_managers.py @@ -57,7 +57,7 @@ def test_run_data_managers( ] ) else: - argv.extend(["-a", GALAXY_ADMIN_KEY]) + argv.extend(["-a", start_container.api_key]) argv.extend( ["-g", container.url, "--config", "tests/run_data_managers.yaml.test"] ) From b5e46f2a675ffa39e6cbb1ff3248b2ea4171e4fb Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 5 Jul 2023 18:08:24 -0400 Subject: [PATCH 28/31] ... --- tests/conftest.py | 1 + tests/test_run_data_managers.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 24f29b1..e7e2844 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -45,6 +45,7 @@ def start_container(**kwargs): environment = kwargs["environment"] environment["GALAXY_CONFIG_OVERRIDE_BOOTSTRAP_ADMIN_API_KEY"] = GALAXY_ADMIN_KEY environment["GALAXY_CONFIG_OVERRIDE_ADMIN_USERS"] = GALAXY_ADMIN_USER + environment["GALAXY_CONFIG_OVERRIDE_CONDA_AUTO_INIT"] = "true" container = client.containers.run( GALAXY_IMAGE, detach=True, ports={f"{GALAXY_PORT}/tcp": None}, **kwargs diff --git a/tests/test_run_data_managers.py b/tests/test_run_data_managers.py index 5b8e86b..6d40905 100644 --- a/tests/test_run_data_managers.py +++ b/tests/test_run_data_managers.py @@ -13,7 +13,6 @@ from ephemeris.shed_tools import InstallRepositoryManager from ephemeris.sleep import galaxy_wait from .conftest import ( - GALAXY_ADMIN_KEY, GALAXY_ADMIN_PASSWORD, GALAXY_ADMIN_USER, ) From e28eea49e6bdb6341096fca015e4eaddc707c149 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 5 Jul 2023 18:32:31 -0400 Subject: [PATCH 29/31] Debugging...
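The debugging aid added below, printing the Galaxy container's logs when a client-side call blows up, generalizes to a small reusable helper, since Galaxy-side failures rarely surface in the client traceback. A possible generalization (a sketch assuming a docker-py Container object such as container.container in these tests, not something this commit adds):

    from contextlib import contextmanager

    @contextmanager
    def dump_logs_on_error(container):
        # Re-raise after printing the container's stdout/stderr so the
        # Galaxy-side error ends up next to the failing test.
        try:
            yield
        except Exception:
            print(container.logs().decode("utf-8", errors="replace"))
            raise

Used as: with dump_logs_on_error(container.container): run_data_managers.main()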
--- tests/test_run_data_managers.py | 7 +++++-- tox.ini | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_run_data_managers.py b/tests/test_run_data_managers.py index 6d40905..f9725e6 100644 --- a/tests/test_run_data_managers.py +++ b/tests/test_run_data_managers.py @@ -24,7 +24,7 @@ class TestRunDataManagers(object): """This class tests run-data-managers""" def test_install_data_managers( - self, start_container + self, start_container, caplog ): """Install the data_managers on galaxy""" container = start_container @@ -34,7 +34,10 @@ def test_install_data_managers( dict(name="data_manager_bwa_mem_index_builder", owner="devteam"), ] irm = InstallRepositoryManager(container.gi) - irm.install_repositories(data_managers) + results = irm.install_repositories(data_managers) + import logging + caplog.set_level(logging.INFO) + assert len(results.errored_repositories) == 0, str(results) # Galaxy is restarted because otherwise data tables are not watched. container.container.exec_run("supervisorctl restart galaxy:") time.sleep(10) # give time for the services to go down diff --git a/tox.ini b/tox.ini index 04f0505..987457a 100644 --- a/tox.ini +++ b/tox.ini @@ -27,7 +27,7 @@ allowlist_externals = sed bash commands = - pytest -v --cov={envsitepackagesdir}/ephemeris --cov-report xml {[tox]test_dir} + pytest -v --cov={envsitepackagesdir}/ephemeris --cov-report xml tests/test_run_data_managers.py::TestRunDataManagers::test_install_data_managers tests/test_run_data_managers.py::TestRunDataManagers::test_run_data_managers # Replace the installed package directory by the source directory. # This is needed for codacy to understand which files have coverage testing # Unfortunately this has to run in the tox env to have access to envsitepackagesdir From 1e57e2d9513171582d3dc6d2809e6c5fbec256c5 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 5 Jul 2023 18:57:36 -0400 Subject: [PATCH 30/31] ... --- tests/test_run_data_managers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_run_data_managers.py b/tests/test_run_data_managers.py index f9725e6..7657ea6 100644 --- a/tests/test_run_data_managers.py +++ b/tests/test_run_data_managers.py @@ -64,7 +64,11 @@ def test_run_data_managers( ["-g", container.url, "--config", "tests/run_data_managers.yaml.test"] ) sys.argv = argv - run_data_managers.main() + try: + run_data_managers.main() + except RuntimeError: + print(container.container.logs()) + raise def test_run_data_managers_installation_skipped( self, start_container From 0308e7761fb8127f213613923446fee0da71d6af Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 5 Jul 2023 19:20:51 -0400 Subject: [PATCH 31/31] Umm? --- tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/conftest.py b/tests/conftest.py index e7e2844..b4f3a5f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -46,6 +46,7 @@ def start_container(**kwargs): environment["GALAXY_CONFIG_OVERRIDE_BOOTSTRAP_ADMIN_API_KEY"] = GALAXY_ADMIN_KEY environment["GALAXY_CONFIG_OVERRIDE_ADMIN_USERS"] = GALAXY_ADMIN_USER environment["GALAXY_CONFIG_OVERRIDE_CONDA_AUTO_INIT"] = "true" + environment["GALAXY_CONFIG_OVERRIDE_CONDA_PREFIX"] = "/galaxy/server/database/dependencies/_conda2" container = client.containers.run( GALAXY_IMAGE, detach=True, ports={f"{GALAXY_PORT}/tcp": None}, **kwargs