diff --git a/docs/source/api/etl_api.rst b/docs/source/api/etl_api.rst index 0d42cfb3..4410e3d9 100644 --- a/docs/source/api/etl_api.rst +++ b/docs/source/api/etl_api.rst @@ -3,13 +3,11 @@ Source ETL API ============== -Base ----- +Update methods +-------------- -.. autoclass:: gene.etl.base.Base - :members: - :special-members: __init__ - :undoc-members: +.. automodule:: gene.etl.update + :members: Exceptions ---------- diff --git a/docs/source/conf.py b/docs/source/conf.py index 46a04cce..2eeab8a3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -19,6 +19,7 @@ "sphinx_autodoc_typehints", "sphinx.ext.linkcode", "sphinx_copybutton", + "sphinx_click", ] templates_path = ["_templates"] @@ -77,9 +78,112 @@ def linkcode_resolve(domain, info): if not info["module"]: return None filename = info["module"].replace(".", "/") - return f"https://github.com/cancervariants/gene-normalization/blob/main/{filename}.py" # noqa: E501 + return ( + f"https://github.com/cancervariants/gene-normalization/blob/main/{filename}.py" # noqa: E501 + ) # -- code block style -------------------------------------------------------- pygments_style = "default" pygements_dark_style = "monokai" + +# -- sphinx-click ------------------------------------------------------------ +# These functions let us write descriptions/docstrings in a way that doesn't look +# weird in the Click CLI, but get additional formatting in the sphinx-click autodocs for +# better readability. 
+from typing import List +import re + +from click.core import Context +from sphinx.application import Sphinx +from sphinx_click.ext import _get_usage, _format_usage, _indent + +CMD_PATTERN = r"--[^ ]+" +STR_PATTERN = r"\"[^ ]+\"" +SNAKE_PATTERN = r"[A-Z]+_[A-Z_]*[A-Z]" + + +def _add_formatting_to_string(line: str) -> str: + """Add fixed-width code formatting to span sections in lines: + + * shell options, eg `--update_all` + * double-quoted strings, eg `"HGNC"` + * all caps SNAKE_CASE env vars, eg `GENE_NORM_REMOTE_DB_URL` + """ + for pattern in (CMD_PATTERN, STR_PATTERN, SNAKE_PATTERN): + line = re.sub(pattern, lambda x: f"``{x.group()}``", line) + return line + + +def process_description(app: Sphinx, ctx: Context, lines: List[str]): + """Add custom formatting to sphinx-click autodoc descriptions. + + * remove :param: :return: etc + * add fixed-width (code) font to certain words + * add code block formatting to example shell commands + * move primary usage example to the top of the description + + Because we have to modify the lines list in place, we have to make multiple passes + through it to format everything correctly. 
+ """ + if not lines: + return + + # chop off params + param_boundary = None + for i, line in enumerate(lines): + if ":param" in line: + param_boundary = i + break + if param_boundary is not None: + del lines[param_boundary:] + lines[-1] = "" + + # add code formatting to strings, commands, and env vars + lines_to_fmt = [] + for i, line in enumerate(lines): + if line.startswith(" ") or line.startswith(">>> "): + continue # skip example code blocks + if any( + [ + re.findall(CMD_PATTERN, line), + re.findall(STR_PATTERN, line), + re.findall(SNAKE_PATTERN, line), + ] + ): + lines_to_fmt.append(i) + for line_num in lines_to_fmt: + lines[line_num] = _add_formatting_to_string(lines[line_num]) + + # add code block formatting to example console commands + for i in range(len(lines) - 1, -1, -1): + if lines[i].startswith(" "): + lines.insert(i + 2, "") + if i == 0 or not lines[i - 1].startswith(" "): + lines.insert(i, "") + lines.insert(i, ".. code-block:: console") + + # put usage at the top of the description + lines.insert(0, "") + for usage_line in _get_usage(ctx).splitlines()[::-1]: + lines.insert(0, _indent(usage_line)) + lines.insert(0, "") + lines.insert(0, ".. code-block:: shell") + + +def process_option(app: Sphinx, ctx: Context, lines: List[str]): + """Add fixed-width formatting to strings in sphinx-click autodoc options.""" + for i, line in enumerate(lines): + if re.findall(STR_PATTERN, line): + lines[i] = re.sub(STR_PATTERN, lambda x: f"``{x.group()}``", line) + + +def setup(app): + """Used to hook format customization into sphinx-click build. + + In particular, since we move usage to the top of the command description, we need + an extra hook here to silence the built-in usage section. 
+ """ + app.connect("sphinx-click-process-description", process_description) + app.connect("sphinx-click-process-options", process_option) + app.connect("sphinx-click-process-usage", lambda app, ctx, lines: lines.clear()) diff --git a/docs/source/install.rst b/docs/source/install.rst index ce95c9ad..4f46e436 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -48,9 +48,9 @@ Set the environment variable ``GENE_NORM_DB_URL`` to a connection description fo Load data +++++++++ -Use the ``gene_norm_update_remote`` shell command to load data from the most recent remotely-stored data dump: :: +Use the ``gene-normalizer update-from-remote`` shell command to load data from the most recent remotely-stored data dump: :: - gene_norm_update_remote + gene-normalizer update-from-remote Start service +++++++++++++ @@ -145,7 +145,7 @@ Load data To load all source data, and then generate normalized records, use the following shell command: :: - gene_norm_update --update_all --update_merged + gene-normalizer update --all --normalize This will download the latest available versions of all source data files, extract and transform recognized gene concepts, load them into the database, and construct normalized concept groups. For more specific update commands, see :ref:`Loading and updating data `. diff --git a/docs/source/managing_data/loading_and_updating_data.rst b/docs/source/managing_data/loading_and_updating_data.rst index e3185c94..cfa0984c 100644 --- a/docs/source/managing_data/loading_and_updating_data.rst +++ b/docs/source/managing_data/loading_and_updating_data.rst @@ -3,55 +3,13 @@ Loading and updating data ========================= +The Gene Normalizer defines a command line tool for data management. It includes functions for refreshing data, checking database status, and for the PostgreSQL data backend, dumping to a local file and updating from a remote backup. + .. 
note:: See the :ref:`ETL API documentation` for information on programmatic access to the data loader classes. -Full load/reload ----------------- - -Calling the Gene Normalizer update command with the ``--update_all`` and ``--update_merged`` flags will delete all existing data, fetch new source data if available, and then perform a complete reload of the database (including merged records): - -.. code-block:: shell - - gene_norm_update --update_all --update_merged - - -Reload individual source ------------------------- - -To update specific sources, call the ``--sources`` option with one or more source name(s) quoted and separated by spaces. While it is possible to update individual source data without also updating the normalized record data, that may affect the proper function of the normalized query endpoints, so it is recommended to include the ``--update_merged`` flag as well. - -.. code-block:: shell - - gene_norm_update --sources="HGNC NCBI" --update_merged - - -Use local data --------------- - -The Gene Normalizer will fetch the latest available data from all sources if local data is out-of-date. To suppress this and force usage of local files, use the `--use_existing` flag: - -.. code-block:: shell - - gene_norm_update --update_all --use_existing - - -Check DB health ---------------- - -The shell command ``gene_norm_check_db`` performs a basic check on the database status. It first confirms that the database's schema exists, and then identifies whether metadata is available for each source, and whether gene record and normalized concept tables are non-empty. Check the process's exit code for the result (per the UNIX standard, ``0`` means success, and any other return code means failure). - -.. code-block:: console - - $ gene_norm_check_db - $ echo $? - 1 # indicates failure - -This command is equivalent to the combination of the database classes' ``check_schema_initialized`` and ``check_tables_populated`` methods: - -.. 
code-block:: python - from gene.database import create_db - db = create_db() - db_is_healthy = db.check_schema_initialized() and db.check_tables_populated() +.. click:: gene.cli:cli + :prog: gene-normalizer + :nested: full diff --git a/docs/source/managing_data/postgresql.rst b/docs/source/managing_data/postgresql.rst index 71407fd3..90febbc7 100644 --- a/docs/source/managing_data/postgresql.rst +++ b/docs/source/managing_data/postgresql.rst @@ -24,18 +24,18 @@ Once created, set the environment variable ``GENE_NORM_DB_URL`` to a connection Load from remote source -------------------------------- -The Gene Normalizer's PostgreSQL class provides the ``gene_norm_update_remote`` shell command to refresh its data directly from a remotely-stored SQL dump, instead of acquiring, transforming, and loading source data. This enables data loading on the order of seconds rather than hours. See the command description at ``gene_norm_update_remote --help`` for more information. +The Gene Normalizer's PostgreSQL class provides the ``gene-normalizer update-from-remote`` shell command to refresh its data directly from a remotely-stored SQL dump, instead of acquiring, transforming, and loading source data. This enables data loading on the order of seconds rather than hours. See the command description at ``gene-normalizer update-from-remote --help`` for more information. By default, this command will fetch the `latest data dump `_ provided by the VICC. 
Alternative URLs can be set with the ``--data_url`` option: :: - gene_norm_update_remote --data_url=https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_20230322163523.sql.tar.gz + gene-normalizer update-from-remote --data_url=https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_20230322163523.sql.tar.gz Create SQL dump from database ----------------------------- -The Gene Normalizer's PostgreSQL class also provides the ``gene_norm_dump`` shell command to create a SQL dump of current data into a file. This command will create a file named ``gene_norm_YYYYMMDDHHmmss.sql`` in the current directory; the ``-o`` option can be used to specify an alternate location, like so: :: +The Gene Normalizer's PostgreSQL class also provides the ``gene-normalizer dump-database`` shell command to create a SQL dump of current data into a file. This command will create a file named ``gene_norm_YYYYMMDDHHmmss.sql`` in the current directory; the ``-o`` option can be used to specify an alternate location, like so: :: - gene_norm_dump -o ~/.gene_data/ + gene-normalizer dump-database -o ~/.gene_data/ -See ``gene_norm_dump --help`` for more information. +See ``gene-normalizer dump-database --help`` for more information. 
diff --git a/pyproject.toml b/pyproject.toml index 41c8f7fd..df44570c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,8 @@ docs = [ "sphinx-copybutton==0.5.2", "sphinxext-opengraph==0.8.2", "furo==2023.3.27", - "gravis==0.1.0" + "gravis==0.1.0", + "sphinx-click==5.0.1", ] [project.urls] @@ -57,10 +58,7 @@ Source = "https://github.com/cancervariants/gene-normalization" "Bug Tracker" = "https://github.com/cancervariants/gene-normalization/issues" [project.scripts] -gene_norm_update = "gene.cli:update_normalizer_db" -gene_norm_update_remote = "gene.cli:update_from_remote" -gene_norm_dump = "gene.cli:dump_database" -gene_norm_check_db = "gene.cli:check_db" +gene-normalizer = "gene.cli:cli" [build-system] requires = ["setuptools>=61.0"] diff --git a/src/gene/cli.py b/src/gene/cli.py index f7eef4f5..fc4e168c 100644 --- a/src/gene/cli.py +++ b/src/gene/cli.py @@ -2,54 +2,125 @@ import logging import os from pathlib import Path -from timeit import default_timer as timer -from typing import Collection, List, Optional, Set +from typing import Optional, Tuple import click -from gene.database import ( - AbstractDatabase, - DatabaseReadException, - DatabaseWriteException, - create_db, -) +from gene.database import create_db from gene.database.database import DatabaseException -from gene.schemas import SOURCES, SourceName +from gene.etl.update import update_all_sources, update_normalized, update_source +from gene.schemas import SourceName logger = logging.getLogger("gene") logger.setLevel(logging.DEBUG) -@click.command() -@click.option("--db_url", help="URL endpoint for the application database.") -@click.option("--verbose", "-v", is_flag=True, help="Print result to console if set.") -def check_db(db_url: str, verbose: bool = False) -> None: - """Perform basic checks on DB health and population. Exits with status code 1 - if DB schema is uninitialized or if critical tables appear to be empty. +url_description = 'URL endpoint for the application database. 
Can either be a URL to a local DynamoDB server (e.g. "http://localhost:8001") or a libpq-compliant PostgreSQL connection description (e.g. "postgresql://postgres:password@localhost:5432/gene_normalizer").' - \f - :param db_url: URL to normalizer database - :param verbose: if true, print result to console - """ # noqa: D301 - db = create_db(db_url, False) - if not db.check_schema_initialized(): - if verbose: - click.echo("Health check failed: DB schema uninitialized.") - click.get_current_context().exit(1) - if not db.check_tables_populated(): - if verbose: - click.echo("Health check failed: DB is incompletely populated.") - click.get_current_context().exit(1) +@click.group() +def cli() -> None: + """Manage Gene Normalizer data.""" - if verbose: - click.echo("DB health check successful: tables appear complete.") +@cli.command() +@click.argument("sources", nargs=-1) +@click.option("--all", is_flag=True, help="Update records for all sources.") +@click.option("--normalize", is_flag=True, help="Create normalized records.") +@click.option("--db_url", help=url_description) +@click.option("--aws_instance", is_flag=True, help="Use cloud DynamodDB instance.") +@click.option( + "--use_existing", + is_flag=True, + default=False, + help="Use most recent locally-available source data instead of fetching latest version", +) +@click.option( + "--silent", "-s", is_flag=True, default=False, help="Suppress console output." +) +def update( + sources: Tuple[str], + aws_instance: bool, + db_url: str, + all: bool, + normalize: bool, + use_existing: bool, + silent: bool, +) -> None: + """Update provided normalizer SOURCES in the gene database. + + Valid SOURCES are "HGNC", "NCBI", and "Ensembl" (case is irrelevant). SOURCES are + optional, but if not provided, either --all or --normalize must be used. 
+ + For example, the following command will update NCBI and HGNC source records: + + $ gene-normalizer update HGNC NCBI + + To completely reload all source records and construct normalized concepts, use the + --all and --normalize options: -@click.command() + $ gene-normalizer update --all --normalize + + The Gene Normalizer will fetch the latest available data from all sources if local + data is out-of-date. To suppress this and force usage of local files only, use the + --use_existing flag: + + $ gene-normalizer update --all --use_existing + + \f + :param sources: tuple of raw names of sources to update + :param aws_instance: if true, use cloud instance + :param db_url: URI pointing to database + :param all: if True, update all sources (ignore ``sources``) + :param normalize: if True, update normalized records + :param use_existing: if True, use most recent local data instead of fetching latest version + :param silent: if True, suppress console output + """ # noqa: D301 + if (not sources) and (not all) and (not normalize): + click.echo( + "Error: must provide SOURCES or at least one of --all, --normalize\n" + ) + ctx = click.get_current_context() + click.echo(ctx.get_help()) + ctx.exit(1) + + db = create_db(db_url, aws_instance, silent) + + processed_ids = None + if all: + processed_ids = update_all_sources(db, use_existing, silent=silent) + elif sources: + parsed_sources = set() + failed_source_names = [] + for source in sources: + try: + parsed_sources.add(SourceName[source.upper()]) + except KeyError: + failed_source_names.append(source) + if len(failed_source_names) != 0: + click.echo(f"Error: unrecognized sources: {failed_source_names}") + click.echo(f"Valid source options are {list(SourceName)}") + click.get_current_context().exit(1) + + working_processed_ids = set() + for source_name in parsed_sources: + working_processed_ids |= update_source( + source_name, db, use_existing=use_existing, silent=silent + ) + if len(parsed_sources) == len(SourceName): # count unique sources, not raw args + 
processed_ids = working_processed_ids + + if normalize: + update_normalized(db, processed_ids, silent=silent) + + +@cli.command() @click.option("--data_url", help="URL to data dump") -@click.option("--db_url", help="URL endpoint for the application database.") -def update_from_remote(data_url: Optional[str], db_url: str) -> None: +@click.option("--db_url", help=url_description) +@click.option( + "--silent", "-s", is_flag=True, default=False, help="Suppress console output." +) +def update_from_remote(data_url: Optional[str], db_url: str, silent: bool) -> None: """Update data from remotely-hosted DB dump. By default, fetches from latest available dump on VICC S3 bucket; specific URLs can be provided instead by command line option or GENE_NORM_REMOTE_DB_URL environment variable. @@ -57,12 +128,13 @@ def update_from_remote(data_url: Optional[str], db_url: str) -> None: \f :param data_url: user-specified location to pull DB dump from :param db_url: URL to normalizer database + :param silent: if True, suppress console output """ # noqa: D301 if not click.confirm("Are you sure you want to overwrite existing data?"): click.get_current_context().exit() if not data_url: data_url = os.environ.get("GENE_NORM_REMOTE_DB_URL") - db = create_db(db_url, False) + db = create_db(db_url, False, silent) try: db.load_from_remote(data_url) except NotImplementedError: @@ -75,25 +147,77 @@ def update_from_remote(data_url: Optional[str], db_url: str) -> None: click.get_current_context().exit(1) -@click.command() +@cli.command() +@click.option("--db_url", help=url_description) +@click.option( + "--verbose", + "-v", + is_flag=True, + default=False, + help="Print result to console if set.", +) +@click.option( + "--silent", "-s", is_flag=True, default=False, help="Suppress console output." +) +def check_db(db_url: str, verbose: bool, silent: bool) -> None: + """Perform basic checks on DB health and population. 
Exits with status code 1 + if DB schema is uninitialized or if critical tables appear to be empty. + + $ gene-normalizer check-db + $ echo $? + 1 # indicates failure + + This command is equivalent to the combination of the database classes' + ``check_schema_initialized()`` and ``check_tables_populated()`` methods: + + >>> from gene.database import create_db + >>> db = create_db() + >>> db.check_schema_initialized() and db.check_tables_populated() + True # DB passes checks + + \f + :param db_url: URL to normalizer database + :param verbose: if true, print result to console + :param silent: if True, suppress console output + """ # noqa: D301 + db = create_db(db_url, False, silent) + if not db.check_schema_initialized(): + if verbose: + click.echo("Health check failed: DB schema uninitialized.") + click.get_current_context().exit(1) + + if not db.check_tables_populated(): + if verbose: + click.echo("Health check failed: DB is incompletely populated.") + click.get_current_context().exit(1) + + if verbose: + click.echo("DB health check successful: tables appear complete.") + + +@cli.command() @click.option( "--output_directory", "-o", help="Output location to write to", type=click.Path(exists=True, path_type=Path), ) -@click.option("--db_url", help="URL endpoint for the application database.") -def dump_database(output_directory: Path, db_url: str) -> None: +@click.option("--db_url", help=url_description) +@click.option( + "--silent", "-s", is_flag=True, default=False, help="Suppress console output." +) +def dump_database(output_directory: Path, db_url: str, silent: bool) -> None: """Dump data from database into file. 
\f :param output_directory: path to existing directory :param db_url: URL to normalizer database + :param silent: if True, suppress console output """ # noqa: D301 if not output_directory: output_directory = Path(".") - db = create_db(db_url, False) + db = create_db(db_url, False, silent) try: db.export_db(output_directory) except NotImplementedError: @@ -106,213 +230,5 @@ def dump_database(output_directory: Path, db_url: str) -> None: click.get_current_context().exit(1) -def _update_normalizer( - sources: Collection[SourceName], - db: AbstractDatabase, - update_merged: bool, - use_existing: bool, -) -> None: - """Update selected normalizer sources. - - :param sources: names of sources to update - :param db: database instance - :param update_merged: if true, retain processed records to use in updating merged - records - :param use_existing: if True, use most recent local version of source data instead of - fetching from remote - """ - processed_ids = list() - for n in sources: - delete_time = _delete_source(n, db) - _load_source(n, db, delete_time, processed_ids, use_existing) - - if update_merged: - _load_merge(db, processed_ids) - - -def _delete_source(n: SourceName, db: AbstractDatabase) -> float: - """Delete individual source data. - - :param n: name of source to delete - :param db: database instance - :return: time taken (in seconds) to delete - """ - msg = f"Deleting {n.value}..." - click.echo(f"\n{msg}") - logger.info(msg) - start_delete = timer() - db.delete_source(n) - end_delete = timer() - delete_time = end_delete - start_delete - msg = f"Deleted {n.value} in {delete_time:.5f} seconds." - click.echo(f"{msg}\n") - logger.info(msg) - return delete_time - - -_etl_dependency_help = "Are ETL dependencies installed? See the Installation page in the documentation for more info." - - -def _load_source( - n: SourceName, - db: AbstractDatabase, - delete_time: float, - processed_ids: List[str], - use_existing: bool, -) -> None: - """Load individual source data. 
- - :param n: name of source - :param db: database instance - :param delete_time: time taken (in seconds) to run deletion - :param processed_ids: in-progress list of processed gene IDs - :param use_existing: if True, use most recent local data files instead of - fetching from remote - """ - msg = f"Loading {n.value}..." - click.echo(msg) - logger.info(msg) - start_load = timer() - - # used to get source class name from string - try: - from gene.etl import HGNC, NCBI, Ensembl # noqa: F401 - from gene.etl.exceptions import GeneNormalizerEtlError - except ModuleNotFoundError as e: - click.echo( - f"Encountered ModuleNotFoundError attempting to import {e.name}. {_etl_dependency_help}" - ) - click.get_current_context().exit() - SourceClass = eval(n.value) # noqa: N806 - - source = SourceClass(database=db, silent=False) - try: - processed_ids += source.perform_etl(use_existing) - except GeneNormalizerEtlError as e: - logger.error(e) - click.echo(f"Encountered error while loading {n}: {e}.") - click.get_current_context().exit() - end_load = timer() - load_time = end_load - start_load - msg = f"Loaded {n.value} in {load_time:.5f} seconds." - click.echo(msg) - logger.info(msg) - msg = f"Total time for {n.value}: {(delete_time + load_time):.5f} seconds." 
- click.echo(msg) - logger.info(msg) - - -def _delete_normalized_data(database: AbstractDatabase) -> None: - """Delete normalized concepts - - :param database: DB instance - """ - click.echo("\nDeleting normalized records...") - start_delete = timer() - try: - database.delete_normalized_concepts() - except (DatabaseReadException, DatabaseWriteException) as e: - click.echo(f"Encountered exception during normalized data deletion: {e}") - end_delete = timer() - delete_time = end_delete - start_delete - click.echo(f"Deleted normalized records in {delete_time:.5f} seconds.") - - -def _load_merge(db: AbstractDatabase, processed_ids: Set[str]) -> None: - """Load merged concepts - - :param db: database instance - :param processed_ids: in-progress list of processed gene IDs - """ - start = timer() - _delete_normalized_data(db) - if not processed_ids: - processed_ids = db.get_all_concept_ids() - - try: - from gene.etl.merge import Merge - except ModuleNotFoundError as e: - click.echo( - f"Encountered ModuleNotFoundError attempting to import {e.name}. 
{_etl_dependency_help}" - ) - click.get_current_context().exit() - - merge = Merge(database=db) - click.echo("Constructing normalized records...") - merge.create_merged_concepts(processed_ids) - end = timer() - click.echo( - f"Merged concept generation completed in " f"{(end - start):.5f} seconds" - ) - - -@click.command() -@click.option("--sources", help="The source(s) you wish to update separated by spaces.") -@click.option("--aws_instance", is_flag=True, help="Using AWS DynamodDB instance.") -@click.option("--db_url", help="URL endpoint for the application database.") -@click.option("--update_all", is_flag=True, help="Update all normalizer sources.") -@click.option( - "--update_merged", - is_flag=True, - help="Update concepts for normalize endpoint from accepted sources.", -) -@click.option( - "--use_existing", - is_flag=True, - default=False, - help="Use most recent local source data instead of fetching latest version", -) -def update_normalizer_db( - sources: str, - aws_instance: bool, - db_url: str, - update_all: bool, - update_merged: bool, - use_existing: bool, -) -> None: - """Update selected normalizer source(s) in the gene database. For example, the - following command will update NCBI and HGNC data, using a database connection at port 8001: - - % gene_norm_update --sources="NCBI HGNC" --db_url=http://localhost:8001 - - See the documentation for more exhaustive information. 
- - \f - :param sources: names of sources to update, comma-separated - :param aws_instance: if true, use cloud instance - :param db_url: URI pointing to database - :param update_all: if true, update all sources (ignore `normalizer` parameter) - :param update_merged: if true, update normalized records - :param use_existing: if True, use most recent local data instead of fetching latest version - """ # noqa: D301 - db = create_db(db_url, aws_instance) - - if update_all: - _update_normalizer(list(SourceName), db, update_merged, use_existing) - elif not sources: - if update_merged: - _load_merge(db, set()) - else: - ctx = click.get_current_context() - click.echo( - "Must either enter 1 or more sources, or use `--update_all` parameter" - ) # noqa: E501 - click.echo(ctx.get_help()) - ctx.exit() - else: - sources_split = sources.lower().split() - - if len(sources_split) == 0: - raise Exception("Must enter 1 or more source names to update") - - non_sources = set(sources_split) - set(SOURCES) - - if len(non_sources) != 0: - raise Exception(f"Not valid source(s): {non_sources}") - - parsed_source_names = {SourceName(SOURCES[s]) for s in sources_split} - _update_normalizer(parsed_source_names, db, update_merged, use_existing) - - if __name__ == "__main__": - update_normalizer_db() + cli() diff --git a/src/gene/database/database.py b/src/gene/database/database.py index 67bcafd6..76cb7acc 100644 --- a/src/gene/database/database.py +++ b/src/gene/database/database.py @@ -260,8 +260,11 @@ class AwsEnvName(str, Enum): VALID_AWS_ENV_NAMES = {v.value for v in AwsEnvName.__members__.values()} -def confirm_aws_db_use(env_name: str) -> None: - """Check to ensure that AWS instance should actually be used.""" +def confirm_aws_db_use(env_name: AwsEnvName) -> None: + """Check to ensure that AWS instance should actually be used. + + :param env_name: name of database environment. 
+ """ if click.confirm( f"Are you sure you want to use the AWS {env_name} database?", default=False ): @@ -272,7 +275,7 @@ def confirm_aws_db_use(env_name: str) -> None: def create_db( - db_url: Optional[str] = None, aws_instance: bool = False + db_url: Optional[str] = None, aws_instance: bool = False, silent: bool = False ) -> AbstractDatabase: """Database factory method. Checks environment variables and provided parameters and creates a DB instance. @@ -313,6 +316,7 @@ def create_db( :param db_url: address to database instance :param aws_instance: use hosted DynamoDB instance, not local DB + :param silent: if True, suppress console output :return: constructed Database instance """ aws_env_var_set = AWS_ENV_VAR_NAME in environ @@ -320,7 +324,7 @@ def create_db( if aws_env_var_set or aws_instance: from gene.database.dynamodb import DynamoDbDatabase - db = DynamoDbDatabase() + db = DynamoDbDatabase(silent=silent) else: if db_url: endpoint_url = db_url @@ -333,9 +337,9 @@ def create_db( if endpoint_url.startswith("postgres"): from gene.database.postgresql import PostgresDatabase - db = PostgresDatabase(endpoint_url) + db = PostgresDatabase(endpoint_url, silent=silent) else: from gene.database.dynamodb import DynamoDbDatabase - db = DynamoDbDatabase(endpoint_url) + db = DynamoDbDatabase(endpoint_url, silent=silent) return db diff --git a/src/gene/database/dynamodb.py b/src/gene/database/dynamodb.py index 161972e1..e33c98ec 100644 --- a/src/gene/database/dynamodb.py +++ b/src/gene/database/dynamodb.py @@ -43,6 +43,7 @@ def __init__(self, db_url: Optional[str] = None, **db_args) -> None: :param str db_url: URL endpoint for DynamoDB source :Keyword Arguments: * region_name: AWS region (defaults to "us-east-2") + * silent: if True, suppress console output :raise DatabaseInitializationException: if initial setup fails """ self.gene_table = environ.get("GENE_DYNAMO_TABLE", "gene_normalizer") @@ -53,18 +54,17 @@ def __init__(self, db_url: Optional[str] = None, **db_args) -> 
None: raise DatabaseInitializationException( f"Cannot have both GENE_TEST and {AWS_ENV_VAR_NAME} set." ) # noqa: E501 - - aws_env = environ[AWS_ENV_VAR_NAME] - if aws_env not in VALID_AWS_ENV_NAMES: + try: + aws_env = AwsEnvName(environ[AWS_ENV_VAR_NAME]) + except ValueError: raise DatabaseInitializationException( - f"{AWS_ENV_VAR_NAME} must be one of {VALID_AWS_ENV_NAMES}" - ) # noqa: E501 - + f"{AWS_ENV_VAR_NAME} must be one of {VALID_AWS_ENV_NAMES}: found {environ[AWS_ENV_VAR_NAME]} instead." + ) skip_confirmation = environ.get(SKIP_AWS_DB_ENV_NAME) if (not skip_confirmation) or ( skip_confirmation and skip_confirmation != "true" ): # noqa: E501 - confirm_aws_db_use(environ[AWS_ENV_VAR_NAME]) + confirm_aws_db_use(aws_env) boto_params = {"region_name": region_name} @@ -79,7 +79,10 @@ def __init__(self, db_url: Optional[str] = None, **db_args) -> None: endpoint_url = environ["GENE_NORM_DB_URL"] else: endpoint_url = "http://localhost:8000" - click.echo(f"***Using Gene Database Endpoint: {endpoint_url}***") + if db_args.get("silent") != True: # noqa: E712 + click.echo( + f"***Using Gene-Normalizer DynamoDB endpoint: {endpoint_url}***" + ) boto_params = {"region_name": region_name, "endpoint_url": endpoint_url} self.dynamodb = boto3.resource("dynamodb", **boto_params) diff --git a/src/gene/database/postgresql.py b/src/gene/database/postgresql.py index f62a1819..82149cb9 100644 --- a/src/gene/database/postgresql.py +++ b/src/gene/database/postgresql.py @@ -9,6 +9,7 @@ from pathlib import Path from typing import Any, Dict, Generator, List, Optional, Set, Tuple +import click import psycopg import requests from psycopg.errors import ( @@ -51,6 +52,7 @@ def __init__(self, db_url: Optional[str] = None, **db_args) -> None: * user: Postgres username * password: Postgres password (optional or blank if unneeded) * db_name: name of database to connect to + * silent: if True, suppress console output :raise DatabaseInitializationException: if initial setup fails """ @@ -67,6 
"""Provide functions to perform Gene Normalizer updates."""
import logging
from timeit import default_timer as timer
from typing import NoReturn, Optional, Set, Tuple

import click

from gene.database.database import (
    AbstractDatabase,
    DatabaseReadException,
    DatabaseWriteException,
)
from gene.schemas import SourceName

_logger = logging.getLogger(__name__)


def _exit_or_reraise(error: Exception) -> NoReturn:
    """Stop processing after an unrecoverable error.

    When a Click context is active (i.e. the caller is a Click command), exit
    through it with a non-zero status so the shell can detect the failure. When no
    context is active (i.e. these functions are being used as a library), re-raise
    the original exception rather than failing on the missing context.

    :param error: exception that caused the failure
    :raise Exception: ``error`` itself, if no Click context is active
    """
    ctx = click.get_current_context(silent=True)
    if ctx is None:
        raise error
    ctx.exit(1)


def delete_source(
    source: SourceName, db: AbstractDatabase, silent: bool = True
) -> float:
    """Delete all data for an individual source.

    :param source: name of source to delete data for
    :param db: database instance
    :param silent: if True, suppress console output
    :return: time spent deleting source, in seconds
    """
    msg = f"Deleting {source.value}..."
    if not silent:
        click.echo(f"\n{msg}")
    _logger.info(msg)
    start_delete = timer()
    db.delete_source(source)
    delete_time = timer() - start_delete
    msg = f"Deleted {source.value} in {delete_time:.5f} seconds."
    if not silent:
        click.echo(msg)
    _logger.info(msg)
    return delete_time


_etl_dependency_help = "Are ETL dependencies installed? See the Installation page in the documentation for more info."


def load_source(
    source: SourceName, db: AbstractDatabase, use_existing: bool, silent: bool = True
) -> Tuple[float, Set[str]]:
    """Load data for an individual source.

    :param source: name of source to load data for
    :param db: database instance
    :param use_existing: if True, use latest available version of local data
    :param silent: if True, suppress console output
    :return: time spent loading data (in seconds), and set of processed IDs from
        that source
    :raise ModuleNotFoundError: if an ETL module can't be imported and no Click
        context is active
    :raise GeneNormalizerEtlError: if the ETL run fails and no Click context is
        active
    """
    msg = f"Loading {source.value}..."
    if not silent:
        click.echo(msg)
    _logger.info(msg)
    start_load = timer()

    # Imported here rather than at module level so that a missing ETL dependency
    # can be reported with a helpful message instead of breaking module import.
    try:
        from gene.etl import HGNC, NCBI, Ensembl  # noqa: F401
        from gene.etl.exceptions import GeneNormalizerEtlError
    except ModuleNotFoundError as e:
        msg = f"Encountered ModuleNotFoundError attempting to import {e.name}. {_etl_dependency_help}"
        click.echo(msg)
        _logger.error(msg)
        _exit_or_reraise(e)

    # map each source name to the ETL class responsible for it
    sources_table = {
        SourceName.HGNC: HGNC,
        SourceName.ENSEMBL: Ensembl,
        SourceName.NCBI: NCBI,
    }
    etl_instance = sources_table[source](database=db)
    try:
        processed_ids = etl_instance.perform_etl(use_existing)
    except GeneNormalizerEtlError as e:
        _logger.error(e)
        click.echo(f"Encountered error while loading {source.value}: {e}.")
        _exit_or_reraise(e)

    load_time = timer() - start_load
    msg = f"Loaded {source.value} in {load_time:.5f} seconds."
    if not silent:
        click.echo(msg)
    _logger.info(msg)

    return (load_time, set(processed_ids))


def update_source(
    source: SourceName, db: AbstractDatabase, use_existing: bool, silent: bool = True
) -> Set[str]:
    """Refresh data for an individual gene data source.

    Existing records for the source are deleted before the new data is loaded.

    For example, to completely refresh HGNC data:

    >>> from gene.schemas import SourceName
    >>> from gene.database import create_db
    >>> from gene.etl.update import update_source
    >>> db = create_db()
    >>> processed_ids = update_source(SourceName.HGNC, db, use_existing=False)

    :param source: name of source to update
    :param db: database instance
    :param use_existing: if True, use latest available local data
    :param silent: if True, suppress console output
    :return: IDs for records created from source
    """
    delete_time = delete_source(source, db, silent)
    load_time, processed_ids = load_source(source, db, use_existing, silent)
    msg = f"Total time for {source.value}: {(delete_time + load_time):.5f} seconds."
    if not silent:
        click.echo(msg)
    _logger.info(msg)
    return processed_ids


def update_all_sources(
    db: AbstractDatabase, use_existing: bool, silent: bool = True
) -> Set[str]:
    """Refresh data for all gene record sources.

    Sources are processed one at a time, in ``SourceName`` definition order.

    :param db: database instance
    :param use_existing: if True, use latest available local data for all sources
    :param silent: if True, suppress console output
    :return: IDs processed from all sources
    """
    processed_ids: Set[str] = set()
    for source in SourceName:
        processed_ids.update(update_source(source, db, use_existing, silent))
    return processed_ids


def delete_normalized(database: AbstractDatabase, silent: bool = True) -> None:
    """Delete normalized concepts.

    :param database: DB instance
    :param silent: if True, suppress console output
    :raise DatabaseReadException: if the database raises it during deletion
    :raise DatabaseWriteException: if the database raises it during deletion
    """
    # The leading newline is console formatting only; keep it out of the log record.
    msg = "Deleting normalized records..."
    _logger.info(msg)
    if not silent:
        click.echo(f"\n{msg}")
    start_delete = timer()
    try:
        database.delete_normalized_concepts()
    except (DatabaseReadException, DatabaseWriteException) as e:
        click.echo(f"Encountered exception during normalized data deletion: {e}")
        raise
    delete_time = timer() - start_delete
    msg = f"Deleted normalized records in {delete_time:.5f} seconds."
    if not silent:
        click.echo(msg)
    _logger.info(msg)


def update_normalized(
    db: AbstractDatabase, processed_ids: Optional[Set[str]], silent: bool = True
) -> None:
    """Delete existing and update merged normalized records.

    :param db: database instance
    :param processed_ids: IDs to form normalized records from. Provide if available to
        cut down on some potentially slow database calls. If unavailable, this method
        will fetch all known IDs directly.
    :param silent: if True, suppress console output
    :raise ModuleNotFoundError: if the merge module can't be imported and no Click
        context is active
    """
    start = timer()

    # Check that the merge tooling is importable *before* deleting anything, so a
    # missing dependency can't leave the database stripped of normalized records
    # with no way to regenerate them.
    try:
        from gene.etl.merge import Merge
    except ModuleNotFoundError as e:
        msg = f"Encountered ModuleNotFoundError attempting to import {e.name}. {_etl_dependency_help}"
        if not silent:
            click.echo(msg)
        _logger.error(msg)
        _exit_or_reraise(e)

    delete_normalized(db, silent)
    if not processed_ids:
        processed_ids = db.get_all_concept_ids()

    merge = Merge(database=db)
    if not silent:
        click.echo("Constructing normalized records...")
    merge.create_merged_concepts(processed_ids)
    msg = f"Merged concept generation completed in {(timer() - start):.5f} seconds"
    if not silent:
        click.echo(msg)
    _logger.info(msg)


def update_all_and_normalize(
    db: AbstractDatabase, use_existing: bool, silent: bool = True
) -> None:
    """Update all sources as well as normalized records.

    For example, to completely refresh all Gene Normalizer data:

    >>> from gene.database import create_db
    >>> from gene.etl.update import update_all_and_normalize
    >>> db = create_db()
    >>> update_all_and_normalize(db, False)

    :param db: database instance
    :param use_existing: if True, use latest local copy of data
    :param silent: if True, suppress console output
    """
    processed_ids = update_all_sources(db, use_existing, silent)
    update_normalized(db, processed_ids, silent)