diff --git a/setup.py b/setup.py index 1950610..d29e0e0 100644 --- a/setup.py +++ b/setup.py @@ -39,6 +39,9 @@ def get_var(var_name): install_tool_deps=ephemeris.install_tool_deps:main install-tool-deps=ephemeris.install_tool_deps:main set-library-permissions=ephemeris.set_library_permissions:main + _idc-lint=ephemeris._idc_lint:main + _idc-split-data-manager-genomes=ephemeris._idc_split_data_manager_genomes:main + _idc-data-managers-to-tools=ephemeris._idc_data_managers_to_tools:main ''' PACKAGE_DATA = { # Be sure to update MANIFEST.in for source dist. diff --git a/src/ephemeris/__init__.py b/src/ephemeris/__init__.py index 5191018..11d33ac 100644 --- a/src/ephemeris/__init__.py +++ b/src/ephemeris/__init__.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import os import yaml from bioblend import galaxy @@ -16,6 +17,14 @@ ) +def get_or_create_history(history_name: str, gi: galaxy.GalaxyInstance): + histories = gi.histories.get_histories(name=history_name) + if histories: + return histories[0] + else: + return gi.histories.create_history(name=history_name) + + def check_url(url, log=None): if not url.startswith("http"): if log: @@ -39,7 +48,7 @@ def get_galaxy_connection(args, file=None, log=None, login_required=True): url = args.galaxy or file_content.get("galaxy_instance") galaxy_url = check_url(url, log) - api_key = args.api_key or file_content.get("api_key") + api_key = args.api_key or file_content.get("api_key") or os.environ.get("EPHEMERIS_API_KEY") if args.user and args.password: return galaxy.GalaxyInstance( diff --git a/src/ephemeris/_config_models.py b/src/ephemeris/_config_models.py new file mode 100644 index 0000000..ce74804 --- /dev/null +++ b/src/ephemeris/_config_models.py @@ -0,0 +1,83 @@ +from pathlib import Path +from typing import ( + Dict, + List, + Optional, + Union, +) + +import yaml +from pydantic import ( + BaseModel, + Extra, +) + + +StrOrPath = Union[Path, str] + + +class RepositoryInstallTarget(BaseModel): + name: str + owner: str + tool_shed_url: Optional[str] + tool_panel_section_id: Optional[str] + tool_panel_section_label: Optional[str] + revisions: Optional[List[str]] + install_tool_dependencies: Optional[bool] + install_repository_dependencies: Optional[bool] + install_resolver_dependencies: Optional[bool] + + +class RepositoryInstallTargets(BaseModel): + """ """ + api_key: Optional[str] + galaxy_instance: Optional[str] + tools: List[RepositoryInstallTarget] + + +class DataManager(BaseModel, extra=Extra.forbid): + tags: List[str] + tool_id: str + + +class DataManagers(BaseModel, extra=Extra.forbid): + __root__: Dict[str, DataManager] + + +class Genome(BaseModel): + id: str # The unique id of the data in Galaxy + description: str # The description of the data, including its taxonomy, version and date + dbkey: Optional[str] + source: Optional[str] # The source of the data. Can be: 'ucsc', an NCBI accession number or a URL to a fasta file. + + # The following fields are currently purely for human consumption and unused by + # IDC infrastructure. + doi: Optional[str] # Any DOI associated with the data + blob: Optional[str] # A blob for any other pertinent information + checksum: Optional[str] # A SHA256 checksum of the original + version: Optional[str] # Any version information associated with the data + + # Description of actions (data managers) to run on target genome. 
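+    # For example, a hypothetical genomes.yml entry (modeled on the fixture in
+    # tests/test_split_genomes.py) would request indexers like so:
+    #
+    #   - id: rn6
+    #     description: Rat Jul. 2014 (RGSC 6.0/rn6) (rn6)
+    #     source: ucsc
+    #     indexers:
+    #       - data_manager_twobit_builder
+    #       - data_manager_picard_index_builder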
+    indexers: Optional[List[str]]  # indexers to run - keyed on repository name - see data_managers.yml for how to resolve these to tools
+    skiplist: Optional[List[str]]  # unimplemented: but if we implement classes of indexers, these will be ones to skip
+
+
+class Genomes(BaseModel):
+    genomes: List[Genome]
+
+
+def _read_yaml(path: StrOrPath):
+    with open(path, "r") as f:
+        return yaml.safe_load(f)
+
+
+def read_data_managers(path: StrOrPath) -> DataManagers:
+    return DataManagers(__root__=_read_yaml(path))
+
+
+def read_genomes(path: StrOrPath) -> Genomes:
+    return Genomes(**_read_yaml(path))
+
+
+def read_tools(path: StrOrPath) -> RepositoryInstallTargets:
+    return RepositoryInstallTargets(**_read_yaml(path))
diff --git a/src/ephemeris/_idc_build.py b/src/ephemeris/_idc_build.py
new file mode 100644
index 0000000..b2e0b8d
--- /dev/null
+++ b/src/ephemeris/_idc_build.py
@@ -0,0 +1,311 @@
+#!/usr/bin/env python
+"""Build script that takes an IDC-style configuration and builds the relevant data.
+
+The IDC splits things out and manages runs in stages in order to deal with CVMFS,
+but for most deployments one should be able to just run the data managers directly
+and have the data installed or fetch data bundles. This script is meant to become
+that entry point: it reuses much of run_data_managers and will work with a version
+of the data bundle fetching adapted for ephemeris.
+"""
+import logging
+import os
+import re
+from copy import deepcopy
+from typing import (
+    Any,
+    Dict,
+    List,
+    Optional,
+)
+
+import yaml
+from galaxy.util import safe_makedirs
+from pydantic import (
+    BaseModel,
+    Extra,
+)
+
+from . import get_galaxy_connection
+from ._idc_data_managers_to_tools import (
+    DataManager,
+    read_data_managers_configuration,
+)
+from .common_parser import (
+    get_common_args,
+)
+from .ephemeris_log import (
+    disable_external_library_logging,
+    setup_global_logger,
+)
+
+TASK_FILE_NAME = "run_data_managers.yaml"
+DEFAULT_TOOL_ID_MODE = "tool_shed_guid"
+DEFAULT_BUILD_TRACKING_MODE = "galaxy_history"
+DEFAULT_HISTORY_BUILD_PREFIX = "idc"
+
+log = logging.getLogger(__name__)
+
+
+class BuildTracker:
+
+    def is_build_complete(self, build_id: str, indexer_name: str) -> bool:
+        raise NotImplementedError()
+
+
+class Filters:
+    stage: Optional[int] = None
+    data_manager: Optional[str] = None
+    build_id: Optional[str] = None
+
+    def filter_out_data_manager(self, data_manager: str) -> bool:
+        return bool(self.data_manager and data_manager != self.data_manager)
+
+    def filter_out_build_id(self, build_id: str) -> bool:
+        return bool(self.build_id and build_id != self.build_id)
+
+    def filter_out_stage(self, stage: int) -> bool:
+        return bool(self.stage is not None and self.stage != stage)
+
+
+class BuildOptions:
+    merged_genomes_path: str
+    split_genomes_path: str
+    data_managers_path: str
+    build_tracker: BuildTracker
+    tool_id_mode: str = DEFAULT_TOOL_ID_MODE
+    filters: Filters = Filters()
+
+
+def tool_id_for(indexer: str, data_managers: Dict[str, DataManager], mode: str) -> str:
+    data_manager = data_managers[indexer]
+    assert data_manager, f"Could not find a target data manager for indexer name {indexer}"
+    tool_shed_guid = data_manager.tool_id
+    if mode == "short":
+        _ts, _, _owner, _repo_name, rest = tool_shed_guid.split("/", 4)
+        if "/" in rest:
+            return rest.split("/")[0]
+        else:
+            return rest
+    else:
+        return tool_shed_guid
+
+
+class RunDataManager(BaseModel):
+    id: str
+    items: Optional[List[Any]] = None
+    params: Optional[List[Any]] = None
+    data_table_reload: Optional[List[str]] = None
+
+
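+# For orientation, a RunDataManager serializes (via
+# write_run_data_manager_to_file below) to a run_data_managers.yaml stanza
+# along these lines -- a sketch based on the fixtures in
+# tests/test_split_genomes.py:
+#
+#   data_managers:
+#     - id: toolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2
+#       params:
+#         - all_fasta_source: '{{ item.id }}'
+#         - sequence_name: '{{ item.name }}'
+#         - sequence_id: '{{ item.id }}'
+#       items:
+#         - id: rn6
+#           dbkey: rn6
+
+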
+class RunDataManagers(BaseModel): + data_managers: List[RunDataManager] + + +class DataManager(BaseModel, extra=Extra.forbid): + tags: List[str] + tool_id: str + + +class DataManagers(BaseModel, extra=Extra.forbid): + __root__: Dict[str, DataManager] + + +class Genome(BaseModel): + pass + + +class Genomes(BaseModel): + genomes: List[Genome] + + +def write_run_data_manager_to_file(run_data_manager: RunDataManager, path: str): + parent, _ = os.path.split(path) + if not os.path.exists(parent): + safe_makedirs(parent) + run_data_managers = RunDataManagers(data_managers=[run_data_manager]) + with open(path, "w") as of: + yaml.safe_dump(run_data_managers.dict(exclude_unset=True), of) + + +def walk_over_incomplete_runs(build_options: BuildOptions): + data_managers = read_data_managers_configuration(build_options.data_managers_path) + with open(build_options.merged_genomes_path) as f: + genomes_all = yaml.safe_load(f) + genomes = genomes_all["genomes"] + for genome in genomes: + build_id = genome["id"] + if build_options.filters.filter_out_build_id(build_id): + continue + + fetch_indexer = "data_manager_fetch_genome_dbkeys_all_fasta" + do_fetch = not build_options.filters.filter_out_data_manager(fetch_indexer) + source = genome.get("source") + if source is None: + do_fetch = False + if do_fetch and build_options.filters.filter_out_stage(0): + do_fetch = False + + if do_fetch and not build_options.build_tracker.is_build_complete(build_id, fetch_indexer): + log.info(f"Fetching: {build_id}") + fetch_tool_id = tool_id_for(fetch_indexer, data_managers, build_options.tool_id_mode) + fetch_params = [] + fetch_params.append({"dbkey_source|dbkey": genome["id"]}) + source = genome.get("source") + if source == "ucsc": + fetch_params.append({"reference_source|reference_source_selector": "ucsc"}) + fetch_params.append({"reference_source|requested_dbkey": genome["id"]}) + fetch_params.append({"sequence_name": genome["description"]}) + elif re.match("^[A-Z_]+[0-9.]+", source): + fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) + fetch_params.append({"reference_source|reference_source_selector": "ncbi"}) + fetch_params.append( + {"reference_source|requested_identifier": source} + ) + fetch_params.append({"sequence_name": genome["description"]}) + fetch_params.append({"sequence.id": genome["id"]}) + elif re.match("^http", source): + fetch_params.append({"dbkey_source|dbkey_source_selector": "new"}) + fetch_params.append({"reference_source|reference_source_selector": "url"}) + fetch_params.append({"reference_source|user_url": source}) + fetch_params.append({"sequence_name": genome["description"]}) + fetch_params.append({"sequence.id": genome["id"]}) + + fetch_run_data_manager = RunDataManager( + id=fetch_tool_id, + params=fetch_params, + # Not needed according to Marius + # data_table_reload=["all_fasta", "__dbkeys__"], + ) + yield (build_id, fetch_indexer, fetch_run_data_manager) + else: + log.debug(f"Fetch is already completed: {build_id}") + + indexers = genome.get("indexers", []) + for indexer in indexers: + if build_options.filters.filter_out_data_manager(indexer): + continue + + if build_options.filters.filter_out_stage(1): + continue + + if build_options.build_tracker.is_build_complete(build_id, indexer): + log.debug(f"Build is already completed: {build_id} {indexer}") + continue + + log.info(f"Building: {build_id} {indexer}") + + tool_id = tool_id_for(indexer, data_managers, build_options.tool_id_mode) + params = [ + {"all_fasta_source": "{{ item.id }}"}, + {"sequence_name": "{{ item.name 
}}"},
+                {"sequence_id": "{{ item.id }}"},
+            ]
+            # why is this not pulled from the data managers conf? -nate
+            if re.search("bwa", tool_id):
+                params.append({"index_algorithm": "bwtsw"})
+            if re.search("color_space", tool_id):
+                continue
+
+            item = deepcopy(genome)
+            item.pop("indexers", None)
+            item.pop("skiplist", None)
+
+            run_data_manager = RunDataManager(
+                id=tool_id,
+                params=params,
+                items=[item],
+            )
+            yield (build_id, indexer, run_data_manager)
+
+
+class GalaxyHistoryBuildTracker(BuildTracker):
+
+    def __init__(self, history_names: List[str], history_build_prefix: str):
+        self._history_names = history_names
+        self._history_build_prefix = history_build_prefix
+
+    def is_build_complete(self, build_id: str, indexer_name: str) -> bool:
+        target_history_name = f"{self._history_build_prefix}-{build_id}-{indexer_name}"
+        return target_history_name in self._history_names
+
+
+class DirectoryBuildTracker(BuildTracker):
+
+    def __init__(self, split_genomes_path: str):
+        self._split_genomes_path = split_genomes_path
+
+    def is_build_complete(self, build_id: str, indexer_name: str) -> bool:
+        target_directory = os.path.join(self._split_genomes_path, build_id, indexer_name)
+        if not os.path.exists(target_directory):
+            return False
+        bundle_path = os.path.join(target_directory, "data_bundle.tgz")
+        if not os.path.exists(bundle_path):
+            return False
+        return True
+
+
+def _parser():
+    """returns the parser object."""
+    # login required to check history...
+    parser = get_common_args(login_required=True, log_file=True)
+    parser.add_argument('--merged-genomes-path', '-m', default="genomes.yml")
+    parser.add_argument('--split-genomes-path', '-s', default="data_manager_tasks")
+    parser.add_argument('--data-managers-path', default="data_managers.yml")
+
+    parser.add_argument("--tool-id-mode", choices=["tool_shed_guid", "short"], default=DEFAULT_TOOL_ID_MODE)
+    parser.add_argument("--build-tracking", choices=["galaxy_history", "directory"], default=DEFAULT_BUILD_TRACKING_MODE)
+    parser.add_argument("--history-build-prefix", default=DEFAULT_HISTORY_BUILD_PREFIX)
+
+    # filters
+    parser.add_argument('--filter-stage', type=int, default=None)
+    parser.add_argument('--filter-data-manager', default=None)
+    parser.add_argument('--filter-build-id', default=None)
+
+    return parser
+
+
+def get_galaxy_history_names(args) -> List[str]:
+    gi = get_galaxy_connection(args, login_required=True)
+    return [h["name"] for h in gi.histories.get_histories()]
+
+
+def get_build_tracker(args):
+    if args.build_tracking == "galaxy_history":
+        build_tracker = GalaxyHistoryBuildTracker(get_galaxy_history_names(args), args.history_build_prefix)
+    else:
+        build_tracker = DirectoryBuildTracker(args.split_genomes_path)
+    return build_tracker
+
+
+def configure_python_for_build() -> BuildOptions:
+    disable_external_library_logging()
+    parser = _parser()
+    args = parser.parse_args()
+    log = setup_global_logger(name=__name__, log_file=args.log_file)
+    if args.verbose:
+        log.setLevel(logging.DEBUG)
+    else:
+        log.setLevel(logging.INFO)
+
+    build_options = BuildOptions()
+    build_options.data_managers_path = args.data_managers_path
+    build_options.merged_genomes_path = args.merged_genomes_path
+    build_options.split_genomes_path = args.split_genomes_path
+    build_options.build_tracker = get_build_tracker(args)
+    build_options.tool_id_mode = args.tool_id_mode
+
+    filters = Filters()
+    filters.build_id = args.filter_build_id
+    filters.data_manager = args.filter_data_manager
+    filters.stage = args.filter_stage
+    build_options.filters = filters
+    return build_options
+
+
+def main():
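+    # Sketch of intended use while this module is not yet registered as a
+    # console script in setup.py (assumes EPHEMERIS_GALAXY and
+    # EPHEMERIS_API_KEY point at a running Galaxy):
+    #
+    #   python -m ephemeris._idc_build --filter-build-id rn6 --tool-id-mode short
+    #
+    # Completed builds are skipped via the configured BuildTracker; the
+    # corresponding task files written by _idc_split_data_manager_genomes land
+    # at <split_genomes_path>/<build_id>/<indexer>/run_data_managers.yaml.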
+    build_options = configure_python_for_build()
+    # TODO: build request genomes...
+    print(build_options)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/ephemeris/_idc_data_managers_to_tools.py b/src/ephemeris/_idc_data_managers_to_tools.py
new file mode 100644
index 0000000..20f7b68
--- /dev/null
+++ b/src/ephemeris/_idc_data_managers_to_tools.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+"""Helper script for IDC - not yet meant for public consumption.
+
+This script takes a data_managers.yml configuration describing the
+set of data managers the IDC configuration targets and builds a
+tools.yml file from it for use with shed_tools.
+"""
+import argparse
+import logging
+from typing import (
+    Dict,
+    List,
+    NamedTuple,
+)
+
+import yaml
+
+from ._config_models import (
+    read_data_managers,
+    RepositoryInstallTargets,
+)
+from .common_parser import (
+    add_log_file_argument,
+    add_verbosity_argument,
+)
+from .ephemeris_log import (
+    disable_external_library_logging,
+    setup_global_logger,
+)
+
+
+class DataManager(NamedTuple):
+    tool_id: str
+    repository_name: str
+    tags: List[str]
+
+
+def read_data_managers_configuration(path: str) -> Dict[str, DataManager]:
+    raw_data_managers = read_data_managers(path)
+    data_managers: Dict[str, DataManager] = {}
+    for repository_name, data_manager_configuration in raw_data_managers.__root__.items():
+        data_manager = DataManager(
+            tool_id=data_manager_configuration.tool_id,
+            repository_name=repository_name,
+            tags=data_manager_configuration.tags or [],
+        )
+        data_managers[repository_name] = data_manager
+    return data_managers
+
+
+def build_shed_install_conf(path: str) -> dict:
+    data_managers = read_data_managers_configuration(path)
+    tools = []
+    for data_manager in data_managers.values():
+        tool_id = data_manager.tool_id
+        tool_id_parts = tool_id.split("/")
+        repo_owner = tool_id_parts[2]
+        repo_name = tool_id_parts[3]
+        entry = {
+            "name": repo_name,
+            "owner": repo_owner,
+            "tool_panel_section_label": None,
+            "tool_shed_url": "toolshed.g2.bx.psu.edu",
+        }
+        tools.append(entry)
+    tools_yaml = {"tools": tools}
+    return tools_yaml
+
+
+def write_shed_install_conf(data_manager_conf_path: str, output_path: str) -> None:
+    tools_yaml = build_shed_install_conf(data_manager_conf_path)
+
+    # validate generated dict to ensure we're writing out a valid file
+    RepositoryInstallTargets(**tools_yaml)
+
+    with open(output_path, "w") as f:
+        yaml.safe_dump(tools_yaml, f)
+
+
+def _parser():
+    """returns the parser object."""
+
+    parser = argparse.ArgumentParser(add_help=False)
+    general_group = parser.add_argument_group("General options")
+    add_verbosity_argument(general_group)
+    add_log_file_argument(general_group)
+    parser.add_argument('--data-managers-conf', default="data_managers.yml")
+    parser.add_argument('--shed-install-output-conf', default="tools.yml")
+    return parser
+
+
+def main():
+    disable_external_library_logging()
+    parser = _parser()
+    args = parser.parse_args()
+    log = setup_global_logger(name=__name__, log_file=args.log_file)
+    if args.verbose:
+        log.setLevel(logging.DEBUG)
+    else:
+        log.setLevel(logging.INFO)
+    write_shed_install_conf(args.data_managers_conf, args.shed_install_output_conf)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/ephemeris/_idc_lint.py b/src/ephemeris/_idc_lint.py
new file mode 100644
index 0000000..e8949ea
--- /dev/null
+++ b/src/ephemeris/_idc_lint.py
@@ -0,0 +1,42 @@
+import os
+from pathlib import Path
+
+import yaml
+
+from ._config_models import (
+    read_data_managers,
+    read_genomes,
+)
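+# Expected usage (sketch): run the registered console script from the root of
+# an IDC-style checkout, i.e. a directory containing both genomes.yml and
+# data_managers.yml:
+#
+#   cd <idc-checkout> && _idc-lint
+#
+# Any lint failure surfaces as an uncaught exception (and a non-zero exit).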
+
+
+def read_yaml(path: Path):
+    with open(path, "r") as f:
+        return yaml.safe_load(f)
+
+
+def lint_idc_directory(directory: Path):
+    genomes_path = directory / "genomes.yml"
+    data_managers_path = directory / "data_managers.yml"
+    assert genomes_path.exists()
+    assert data_managers_path.exists()
+    data_managers = read_data_managers(data_managers_path).__root__
+    genomes = read_genomes(genomes_path)
+
+    for data_manager in data_managers.values():
+        data_manager_tool_id = data_manager.tool_id
+        if not data_manager_tool_id.startswith("toolshed.g2.bx.psu.edu/"):
+            raise Exception(f"Expected a data manager repository from the main Galaxy tool shed but discovered tool ID {data_manager_tool_id}")
+
+    for genome in genomes.genomes:
+        for indexer in (genome.indexers or []):
+            if indexer not in data_managers:
+                raise Exception(f"Failed to find data manager {indexer} referenced for genome {genome}")
+
+
+def main():
+    lint_idc_directory(Path(os.curdir))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/ephemeris/_idc_split_data_manager_genomes.py b/src/ephemeris/_idc_split_data_manager_genomes.py
new file mode 100644
index 0000000..87e0456
--- /dev/null
+++ b/src/ephemeris/_idc_split_data_manager_genomes.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+"""Helper script for IDC - not yet meant for public consumption.
+
+This script splits genomes.yml into tasks that are meant to be sent to
+run_data_managers.py, while excluding data manager executions specified
+by genomes.yml that have already been executed and appear in the target
+installed data table configuration.
+"""
+import os
+
+from galaxy.util import safe_makedirs
+
+from ._idc_build import (
+    BuildOptions,
+    configure_python_for_build,
+    RunDataManager,
+    TASK_FILE_NAME,
+    walk_over_incomplete_runs,
+    write_run_data_manager_to_file,
+)
+
+
+def split_genomes(build_options: BuildOptions) -> None:
+
+    def write_task_file(build_id: str, indexer: str, run_data_manager: RunDataManager):
+        split_genomes_path = build_options.split_genomes_path
+        if not os.path.exists(build_options.split_genomes_path):
+            safe_makedirs(split_genomes_path)
+
+        task_file_dir = os.path.join(split_genomes_path, build_id, indexer)
+        task_file = os.path.join(task_file_dir, TASK_FILE_NAME)
+        write_run_data_manager_to_file(run_data_manager, task_file)
+
+    for build_id, indexer, run_data_manager in walk_over_incomplete_runs(build_options):
+        write_task_file(build_id, indexer, run_data_manager)
+
+
+def main():
+    build_options = configure_python_for_build()
+    split_genomes(build_options)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/ephemeris/common_parser.py b/src/ephemeris/common_parser.py
index 6992000..30924bf 100644
--- a/src/ephemeris/common_parser.py
+++ b/src/ephemeris/common_parser.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 import argparse
+import os
 
 
 class HideUnderscoresHelpFormatter(argparse.HelpFormatter):
@@ -18,28 +19,37 @@ class ArgumentDefaultsHideUnderscoresHelpFormatter(HideUnderscoresHelpFormatter,
     pass
 
 
+def add_verbosity_argument(parser_or_group):
+    parser_or_group.add_argument(
+        "-v", "--verbose", help="Increase output verbosity.", action="store_true"
+    )
+
+
+def add_log_file_argument(parser_or_group):
+    parser_or_group.add_argument(
+        "--log-file",
+        "--log_file",
+        dest="log_file",
+        help="Where the log file should be stored. 
" + "Default is a file in your system's temp folder", + default=None, + ) + + def get_common_args(login_required=True, log_file=False): parser = argparse.ArgumentParser(add_help=False) general_group = parser.add_argument_group("General options") - general_group.add_argument( - "-v", "--verbose", help="Increase output verbosity.", action="store_true" - ) + add_verbosity_argument(general_group) if log_file: - general_group.add_argument( - "--log-file", - "--log_file", - dest="log_file", - help="Where the log file should be stored. " - "Default is a file in your system's temp folder", - default=None, - ) + add_log_file_argument(general_group) con_group = parser.add_argument_group("Galaxy connection") + default_galaxy = os.environ.get("EPHEMERIS_GALAXY") or "http://localhost:8080" con_group.add_argument( "-g", "--galaxy", help="Target Galaxy instance URL/IP address", - default="http://localhost:8080", + default=default_galaxy, ) if login_required: diff --git a/src/ephemeris/run_data_managers.py b/src/ephemeris/run_data_managers.py index c07feb0..ba98906 100644 --- a/src/ephemeris/run_data_managers.py +++ b/src/ephemeris/run_data_managers.py @@ -28,13 +28,17 @@ import logging import time from collections import namedtuple +from typing import Optional +from bioblend.galaxy import GalaxyInstance from bioblend.galaxy.tool_data import ToolDataClient from bioblend.galaxy.tools import ToolClient from jinja2 import Template +from typing_extensions import Literal from . import ( get_galaxy_connection, + get_or_create_history, load_yaml_file, ) from .common_parser import ( @@ -48,6 +52,7 @@ DEFAULT_URL = "http://localhost" DEFAULT_SOURCE_TABLES = ["all_fasta"] +DATA_MANAGER_MODES = Literal["dry_run", "populate", "bundle"] def wait(gi, job_list, log): @@ -107,7 +112,7 @@ def get_first_valid_entry(input_dict, key_list): class DataManagers: - def __init__(self, galaxy_instance, configuration): + def __init__(self, galaxy_instance: GalaxyInstance, configuration): """ :param galaxy_instance: A GalaxyInstance object (import from bioblend.galaxy) :param configuration: A dictionary. Examples in the ephemeris documentation. @@ -154,8 +159,8 @@ def get_dm_jobs(self, dm): :returns job_list, skipped_job_list""" job_list = [] skipped_job_list = [] - items = self.parse_items(dm.get("items", [""])) - for item in items: + + def handle_item(item: str): dm_id = dm["id"] params = dm["params"] inputs = dict() @@ -169,11 +174,20 @@ def get_dm_jobs(self, dm): job = dict(tool_id=dm_id, inputs=inputs) - data_tables = dm.get("data_table_reload", []) + data_tables = dm.get("data_table_reload") or [] if self.input_entries_exist_in_data_tables(data_tables, inputs): skipped_job_list.append(job) else: job_list.append(job) + + raw_items = dm.get("items") or None + if raw_items: + items = self.parse_items(raw_items) + for item in items: + handle_item(item) + else: + handle_item("") + return job_list, skipped_job_list def dm_is_fetcher(self, dm): @@ -208,6 +222,11 @@ def data_table_entry_exists(self, data_table_name, entry, column="value"): def input_entries_exist_in_data_tables(self, data_tables, input_dict): """Checks whether name and value entries from the input are already present in the data tables. 
If an entry is missing in one of the tables, this function returns False"""
+        if data_tables is None or len(data_tables) == 0:
+            # this logic is all broken I (@jmchilton) think, but let's just skip it all
+            # if we know we don't have data tables to check
+            return False
+
         value_entry = get_first_valid_entry(input_dict, self.possible_value_keys)
         name_entry = get_first_valid_entry(input_dict, self.possible_name_keys)
 
@@ -245,7 +264,14 @@ def parse_items(self, items):
             items = json.loads(rendered_items)
         return items
 
-    def run(self, log=None, ignore_errors=False, overwrite=False):
+    def run(
+        self,
+        log=None,
+        ignore_errors=False,
+        overwrite=False,
+        data_manager_mode: DATA_MANAGER_MODES = "populate",
+        history_name: Optional[str] = None,
+    ):
         """
         Runs the data managers.
         :param log: The log to be used.
@@ -260,6 +286,10 @@ def run(self, log=None, ignore_errors=False, overwrite=False):
         if not log:
             log = logging.getLogger()
 
+        history_id: Optional[str] = None
+        if history_name is not None:
+            history_id = get_or_create_history(history_name, self.gi)["id"]
+
         def run_jobs(jobs, skipped_jobs):
             job_list = []
             for skipped_job in skipped_jobs:
@@ -277,7 +307,7 @@ def run_jobs(jobs, skipped_jobs):
                     all_skipped_jobs.append(skipped_job)
             for job in jobs:
                 started_job = self.tool_client.run_tool(
-                    history_id=None, tool_id=job["tool_id"], tool_inputs=job["inputs"]
+                    history_id=history_id, tool_id=job["tool_id"], tool_inputs=job["inputs"], data_manager_mode=data_manager_mode
                 )
                 log.info(
                     'Dispatched job %i. Running DM: "%s" with parameters: %s'
@@ -343,6 +373,8 @@ def _parser():
         action="store_true",
         help="Do not stop running when jobs have failed.",
     )
+    parser.add_argument("--data-manager-mode", "--data_manager_mode", choices=["bundle", "populate", "dry_run"], default="populate")
+    parser.add_argument("--history-name", default=None)
     return parser
 
 
@@ -358,7 +390,7 @@ def main():
     gi = get_galaxy_connection(args, file=args.config, log=log, login_required=True)
     config = load_yaml_file(args.config)
     data_managers = DataManagers(gi, config)
-    data_managers.run(log, args.ignore_errors, args.overwrite)
+    data_managers.run(log, args.ignore_errors, args.overwrite, data_manager_mode=args.data_manager_mode, history_name=args.history_name)
 
 
 if __name__ == "__main__":
diff --git a/src/ephemeris/shed_tools.py b/src/ephemeris/shed_tools.py
index 35e2553..0f17bd0 100644
--- a/src/ephemeris/shed_tools.py
+++ b/src/ephemeris/shed_tools.py
@@ -35,6 +35,7 @@
 """
 import datetime as dt
 import json
+import logging
 import os
 import re
 import time
@@ -94,6 +95,8 @@
     "Loading proprietary datatypes",
 }
 
+log = logging.getLogger(__name__)
+
 
 class InstallRepoDict(TypedDict):
     name: str
@@ -166,7 +169,7 @@ def filter_installed_repos(
     def install_repositories(
         self,
         repositories: List[InstallRepoDict],
-        log=None,
+        log=log,
         force_latest_revision: bool = False,
         default_toolshed: str = "https://toolshed.g2.bx.psu.edu/",
         default_install_tool_dependencies: bool = False,
@@ -284,7 +287,7 @@ def install_repositories(
             errored_repositories=errored_repositories,
         )
 
-    def update_repositories(self, repositories=None, log=None, **kwargs):
+    def update_repositories(self, repositories=None, log=log, **kwargs):
         if not repositories:  # Repositories None or empty list
             repositories = self.installed_repositories()
         else:
@@ -307,7 +310,7 @@ def test_tools(
         self,
         test_json,
         repositories=None,
-        log=None,
+        log=log,
         test_user_api_key=None,
         test_user="ephemeris@galaxyproject.org",
         test_history_name=None,
@@ -588,7 +591,7 @@ def install_repository_revision(self, repository: 
InstallRepoDict, log):
         )
         return "error"
 
-    def wait_for_install(self, repository, log=None, timeout=3600):
+    def wait_for_install(self, repository, log=log, timeout=3600):
         """
         If nginx times out, we look into the list of installed repositories
         and try to determine if a repository of the same name/owner is still installing.
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/conftest.py b/tests/conftest.py
index 56a89cc..b4f3a5f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -8,15 +8,17 @@
 # It needs to work well with dev. Alternatively we can pin this to 'master' or another stable branch.
 # Preferably a branch that updates with each stable release
-GALAXY_IMAGE = "bgruening/galaxy-stable:20.05"
+DOCKER_IMAGE = "galaxy-min"  # or bgruening
+GALAXY_IMAGE_MIN = "galaxy/galaxy-min:23.1-auto"
+GALAXY_IMAGE_BGRUENING = "bgruening/galaxy-stable:20.05"
+GALAXY_IMAGE = GALAXY_IMAGE_MIN if DOCKER_IMAGE == "galaxy-min" else GALAXY_IMAGE_BGRUENING
+GALAXY_PORT = "80" if DOCKER_IMAGE == "bgruening" else "8080"
 GALAXY_ADMIN_KEY = "fakekey"
 GALAXY_ADMIN_PASSWORD = "password"
 GALAXY_ADMIN_USER = "admin@galaxy.org"
 
-client = docker.from_env()
-
 GalaxyContainer = namedtuple(
-    "GalaxyContainer", ["url", "container", "attributes", "gi"]
+    "GalaxyContainer", ["url", "container", "attributes", "gi", "api_key"]
 )
 
@@ -30,10 +32,24 @@ def start_container(**kwargs):
     # and use the internal ip address instead.
     # But alas, the trappings of a proprietary BSD kernel compel us to do ugly workarounds.
     key = kwargs.get("api_key", GALAXY_ADMIN_KEY)
-    ensure_admin = kwargs.get("ensure_admin", True)
+
+    # Set this to False as we try to get bootstrap working with galaxy-min.
+    # ensure_admin = kwargs.get("ensure_admin", True)
+    ensure_admin = False
+
+    client = docker.from_env()
+
+    if DOCKER_IMAGE != "bgruening":
+        if "environment" not in kwargs:
+            kwargs["environment"] = {}
+        environment = kwargs["environment"]
+        environment["GALAXY_CONFIG_OVERRIDE_BOOTSTRAP_ADMIN_API_KEY"] = GALAXY_ADMIN_KEY
+        environment["GALAXY_CONFIG_OVERRIDE_ADMIN_USERS"] = GALAXY_ADMIN_USER
+        environment["GALAXY_CONFIG_OVERRIDE_CONDA_AUTO_INIT"] = "true"
+        environment["GALAXY_CONFIG_OVERRIDE_CONDA_PREFIX"] = "/galaxy/server/database/dependencies/_conda2"
 
     container = client.containers.run(
-        GALAXY_IMAGE, detach=True, ports={"80/tcp": None}, **kwargs
+        GALAXY_IMAGE, detach=True, ports={f"{GALAXY_PORT}/tcp": None}, **kwargs
     )
     container_id = container.attrs.get("Id")
     print(container_id)
@@ -47,7 +63,7 @@ def start_container(**kwargs):
     exposed_port = (
         container_attributes.get("NetworkSettings")
         .get("Ports")
-        .get("80/tcp")[0]
+        .get(f"{GALAXY_PORT}/tcp")[0]
         .get("HostPort")
     )
 
@@ -57,9 +73,16 @@ def start_container(**kwargs):
         container_url, timeout=180, api_key=key, ensure_admin=ensure_admin
     )
     if not ready:
+        print(client.containers.get(container_id).logs())
         raise Exception("Failed to wait on Galaxy to start.")
     gi = GalaxyInstance(container_url, key=key)
+    if DOCKER_IMAGE != "bgruening":
+        user_dict = gi.users.create_local_user("admin", GALAXY_ADMIN_USER, GALAXY_ADMIN_PASSWORD)
+        api_key = gi.users.get_or_create_user_apikey(user_dict["id"])
+        gi = GalaxyInstance(container_url, key=api_key)
+    else:
+        api_key = GALAXY_ADMIN_KEY
     yield GalaxyContainer(
-        url=container_url, container=container, attributes=container_attributes, gi=gi
+        url=container_url, container=container, attributes=container_attributes, gi=gi, api_key=api_key
     )
     container.remove(force=True)
diff --git 
a/tests/test_idc_data_managers_to_tools.py b/tests/test_idc_data_managers_to_tools.py new file mode 100644 index 0000000..f00a647 --- /dev/null +++ b/tests/test_idc_data_managers_to_tools.py @@ -0,0 +1,13 @@ +from pathlib import Path + +from ephemeris._config_models import read_tools +from ephemeris._idc_data_managers_to_tools import write_shed_install_conf +from .test_split_genomes import setup_mock_idc_dir + + +def test_idc_lint_valid(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + output_path = tmp_path / "output.yaml" + write_shed_install_conf(tmp_path / "data_managers.yml", output_path) + # validate the generated tools file... + read_tools(output_path) diff --git a/tests/test_idc_lint.py b/tests/test_idc_lint.py new file mode 100644 index 0000000..3db0e74 --- /dev/null +++ b/tests/test_idc_lint.py @@ -0,0 +1,64 @@ +from pathlib import Path + +import pytest + +from ephemeris._idc_lint import lint_idc_directory +from .test_split_genomes import setup_mock_idc_dir + + +MISSPELLED_DATA_MANAGER_YAML_STR = """ +data_manager_fetch_genome_dbkeys_all_fasta: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_fetch_genome_dbkeys_all_fasta/data_manager_fetch_genome_all_fasta_dbkey/0.0.3' + tags: + - fetch_source +data_manager_two_bit_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2' + tags: + - genome +data_manager_picard_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_picard_index_builder/data_manager/picard_index_builder/0.0.1' + tags: + - genome +data_manager_star_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/iuc/data_manager_star_index_builder/rna_star_index_builder_data_manager/0.0.5' + tags: + - genome +""" + +TESTTOOLSHED_DATA_MANAGER_YAML_STR = """ +data_manager_fetch_genome_dbkeys_all_fasta: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_fetch_genome_dbkeys_all_fasta/data_manager_fetch_genome_all_fasta_dbkey/0.0.3' + tags: + - fetch_source +data_manager_twobit_builder: + tool_id: 'testtoolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2' + tags: + - genome +data_manager_picard_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_picard_index_builder/data_manager/picard_index_builder/0.0.1' + tags: + - genome +data_manager_star_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/iuc/data_manager_star_index_builder/rna_star_index_builder_data_manager/0.0.5' + tags: + - genome +""" + + +def test_idc_lint_valid(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + lint_idc_directory(tmp_path) + + +def test_idc_lint_misspelled_dm(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + (tmp_path / "data_managers.yml").write_text(MISSPELLED_DATA_MANAGER_YAML_STR) + with pytest.raises(Exception) as exc_info: + lint_idc_directory(tmp_path) + # misspelled two_bit in data managers so data_manager_twobit_builder is missing + assert "data_manager_twobit_builder" in str(exc_info.value) + + (tmp_path / "data_managers.yml").write_text(TESTTOOLSHED_DATA_MANAGER_YAML_STR) + with pytest.raises(Exception) as exc_info: + lint_idc_directory(tmp_path) + assert "testtoolshed" in str(exc_info.value) diff --git a/tests/test_run_data_managers.py b/tests/test_run_data_managers.py index ccee537..7657ea6 100644 --- a/tests/test_run_data_managers.py +++ b/tests/test_run_data_managers.py @@ -7,16 +7,15 @@ import pytest import yaml -from conftest import ( - GALAXY_ADMIN_KEY, - GALAXY_ADMIN_PASSWORD, - GALAXY_ADMIN_USER, -) 
from ephemeris import run_data_managers from ephemeris.run_data_managers import DataManagers from ephemeris.shed_tools import InstallRepositoryManager from ephemeris.sleep import galaxy_wait +from .conftest import ( + GALAXY_ADMIN_PASSWORD, + GALAXY_ADMIN_USER, +) AUTH_BY = "key" @@ -25,7 +24,7 @@ class TestRunDataManagers(object): """This class tests run-data-managers""" def test_install_data_managers( - self, start_container + self, start_container, caplog ): """Install the data_managers on galaxy""" container = start_container @@ -35,7 +34,10 @@ def test_install_data_managers( dict(name="data_manager_bwa_mem_index_builder", owner="devteam"), ] irm = InstallRepositoryManager(container.gi) - irm.install_repositories(data_managers) + results = irm.install_repositories(data_managers) + import logging + caplog.set_level(logging.INFO) + assert len(results.errored_repositories) == 0, str(results) # Galaxy is restarted because otherwise data tables are not watched. container.container.exec_run("supervisorctl restart galaxy:") time.sleep(10) # give time for the services to go down @@ -57,12 +59,16 @@ def test_run_data_managers( ] ) else: - argv.extend(["-a", GALAXY_ADMIN_KEY]) + argv.extend(["-a", start_container.api_key]) argv.extend( ["-g", container.url, "--config", "tests/run_data_managers.yaml.test"] ) sys.argv = argv - run_data_managers.main() + try: + run_data_managers.main() + except RuntimeError: + print(container.container.logs()) + raise def test_run_data_managers_installation_skipped( self, start_container diff --git a/tests/test_split_genomes.py b/tests/test_split_genomes.py new file mode 100644 index 0000000..0df8d7a --- /dev/null +++ b/tests/test_split_genomes.py @@ -0,0 +1,178 @@ +from pathlib import Path + +import yaml + +from ephemeris._idc_build import ( + BuildOptions, + Filters, + GalaxyHistoryBuildTracker, + RunDataManagers, +) +from ephemeris._idc_split_data_manager_genomes import ( + split_genomes, +) + + +MERGED_YAML_STR = """ +genomes: + - dbkey: hg19_rCRS_pUC18_phiX174 + description: Homo sapiens (hg19 with mtDNA replaced with rCRS, and containing pUC18 + and phiX174) + source: http://datacache.galaxyproject.org/managed/seq/hg19_rCRS_pUC18_phiX174.fa + id: hg19_rCRS_pUC18_phiX174 + indexers: + - data_manager_twobit_builder + - data_manager_star_index_builder + + - dbkey: rn6 + description: Rat Jul. 
2014 (RGSC 6.0/rn6) (rn6) + id: rn6 + source: ucsc + indexers: + - data_manager_twobit_builder + - data_manager_picard_index_builder +""" + +DATA_MANAGER_YAML_STR = """ +data_manager_fetch_genome_dbkeys_all_fasta: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_fetch_genome_dbkeys_all_fasta/data_manager_fetch_genome_all_fasta_dbkey/0.0.3' + tags: + - fetch_source +data_manager_twobit_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2' + tags: + - genome +data_manager_picard_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/devteam/data_manager_picard_index_builder/data_manager/picard_index_builder/0.0.1' + tags: + - genome +data_manager_star_index_builder: + tool_id: 'toolshed.g2.bx.psu.edu/repos/iuc/data_manager_star_index_builder/rna_star_index_builder_data_manager/0.0.5' + tags: + - genome +""" + + +def setup_mock_idc_dir(directory: Path): + merged = directory / "genomes.yml" + merged.write_text(MERGED_YAML_STR) + + data_managers = directory / "data_managers.yml" + data_managers.write_text(DATA_MANAGER_YAML_STR) + + +def read_and_validate_run_data_manager_yaml(path): + with open(path, "r") as f: + return RunDataManagers(**yaml.safe_load(f)) + + +def split_options_for(tmp_path: Path) -> BuildOptions: + history_names = ["idc-hg19_rCRS_pUC18_phiX174-data_manager_star_index_builder"] + build_tracker = GalaxyHistoryBuildTracker(history_names, "idc") + + split_options = BuildOptions() + split_options.merged_genomes_path = tmp_path / "genomes.yml" + split_options.split_genomes_path = str(tmp_path / "split") + split_options.data_managers_path = tmp_path / "data_managers.yml" + split_options.build_tracker = build_tracker + return split_options + + +def test_split_genomes(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + split_genomes(split_options) + new_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + complete_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_star_index_builder" + assert new_task.exists() + assert not complete_task.exists() + new_task_run_yaml = new_task / "run_data_managers.yaml" + # ensure we don't serialize unset fields + assert "data_table_reload" not in new_task_run_yaml.read_text() + run = read_and_validate_run_data_manager_yaml(new_task_run_yaml) + assert len(run.data_managers) == 1 + data_manager = run.data_managers[0] + assert data_manager.id == "toolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2" + assert data_manager.items[0]["id"] == "hg19_rCRS_pUC18_phiX174" + assert data_manager.items[0]["dbkey"] == "hg19_rCRS_pUC18_phiX174" + + +def test_split_genomes_short_ids(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + split_options.tool_id_mode = "short" + split_genomes(split_options) + + new_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + new_task_run_yaml = new_task / "run_data_managers.yaml" + run = read_and_validate_run_data_manager_yaml(new_task_run_yaml) + assert len(run.data_managers) == 1 + data_manager = run.data_managers[0] + assert data_manager.id == "twobit_builder_data_manager" + + +def test_split_genomes_filter_on_data_manager(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + filters = Filters() + 
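+    # Filters.data_manager keeps only runs for the named data manager, so after
+    # the first pass below no twobit task should have been written.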
filters.data_manager = "data_manager_star_index_builder" + split_options.filters = filters + + split_genomes(split_options) + new_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + assert not new_task.exists() + + filters.data_manager = "data_manager_twobit_builder" + split_genomes(split_options) + assert new_task.exists() + + +def test_split_genomes_filter_on_build_id(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + filters = Filters() + filters.build_id = "rn6" + split_options.filters = filters + + split_genomes(split_options) + filtered_out_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + assert not filtered_out_task.exists() + + filtered_in_task = split_path / "rn6" / "data_manager_twobit_builder" + assert filtered_in_task.exists() + + +def test_split_genomes_filter_on_stage_0(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + filters = Filters() + filters.stage = 0 + split_options.filters = filters + + split_genomes(split_options) + filtered_out_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + assert not filtered_out_task.exists() + + filtered_in_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_fetch_genome_dbkeys_all_fasta" + assert filtered_in_task.exists() + + +def test_split_genomes_filter_on_stage_1(tmp_path: Path): + setup_mock_idc_dir(tmp_path) + split_path = tmp_path / "split" + split_options = split_options_for(tmp_path) + filters = Filters() + filters.stage = 1 + split_options.filters = filters + + split_genomes(split_options) + filtered_out_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_fetch_genome_dbkeys_all_fasta" + assert not filtered_out_task.exists() + + filtered_in_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder" + assert filtered_in_task.exists() diff --git a/tox.ini b/tox.ini index 04f0505..987457a 100644 --- a/tox.ini +++ b/tox.ini @@ -27,7 +27,7 @@ allowlist_externals = sed bash commands = - pytest -v --cov={envsitepackagesdir}/ephemeris --cov-report xml {[tox]test_dir} + pytest -v --cov={envsitepackagesdir}/ephemeris --cov-report xml tests/test_run_data_managers.py::TestRunDataManagers::test_install_data_managers tests/test_run_data_managers.py::TestRunDataManagers::test_run_data_managers # Replace the installed package directory by the source directory. # This is needed for codacy to understand which files have coverage testing # Unfortunately this has to run in the tox env to have access to envsitepackagesdir