Skip to content

Commit

Permalink
enhancement!: CLI overhaul (#299)
Browse files Browse the repository at this point in the history
Co-authored-by: Kori Kuzma <[email protected]>
  • Loading branch information
jsstevenson and korikuzma authored Dec 13, 2023
1 parent e64953d commit 0e4f7d9
Show file tree
Hide file tree
Showing 11 changed files with 531 additions and 328 deletions.
10 changes: 4 additions & 6 deletions docs/source/api/etl_api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,11 @@
Source ETL API
==============

Base
----
Update methods
--------------

.. autoclass:: gene.etl.base.Base
:members:
:special-members: __init__
:undoc-members:
.. automodule:: gene.etl.update
:members:

Exceptions
----------
Expand Down
106 changes: 105 additions & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"sphinx_autodoc_typehints",
"sphinx.ext.linkcode",
"sphinx_copybutton",
"sphinx_click",
]

templates_path = ["_templates"]
Expand Down Expand Up @@ -77,9 +78,112 @@ def linkcode_resolve(domain, info):
if not info["module"]:
return None
filename = info["module"].replace(".", "/")
return f"https://github.com/cancervariants/gene-normalization/blob/main/{filename}.py" # noqa: E501
return (
f"https://github.com/cancervariants/gene-normalization/blob/main/{filename}.py" # noqa: E501
)


# -- code block style --------------------------------------------------------
# Syntax-highlighting themes for rendered code blocks. The dark-mode option must
# be spelled ``pygments_dark_style``; a misspelled name is silently ignored by
# Sphinx, leaving dark mode on the theme's default highlighting.
pygments_style = "default"
pygments_dark_style = "monokai"

# -- sphinx-click ------------------------------------------------------------
# These functions let us write descriptions/docstrings in a way that doesn't look
# weird in the Click CLI, but get additional formatting in the sphinx-click autodocs for
# better readability.
from typing import List
import re

from click.core import Context
from sphinx.application import Sphinx
from sphinx_click.ext import _get_usage, _format_usage, _indent

CMD_PATTERN = r"--[^ ]+"
STR_PATTERN = r"\"[^ ]+\""
SNAKE_PATTERN = r"[A-Z]+_[A-Z_]*[A-Z]"


def _add_formatting_to_string(line: str) -> str:
"""Add fixed-width code formatting to span sections in lines:
* shell options, eg `--update_all`
* double-quoted strings, eg `"HGNC"`
* all caps SNAKE_CASE env vars, eg `GENE_NORM_REMOTE_DB_URL`
"""
for pattern in (CMD_PATTERN, STR_PATTERN, SNAKE_PATTERN):
line = re.sub(pattern, lambda x: f"``{x.group()}``", line)
return line


def process_description(app: Sphinx, ctx: Context, lines: List[str]) -> None:
    """Add custom formatting to sphinx-click autodoc descriptions.

    * remove :param: :return: etc
    * add fixed-width (code) font to certain words
    * add code block formatting to example shell commands
    * move primary usage example to the top of the description

    Because we have to modify the lines list in place, we have to make multiple passes
    through it to format everything correctly.

    :param app: Sphinx application handed to the event handler (unused here)
    :param ctx: Click context of the command being documented; used to build the
        usage text
    :param lines: description lines, modified in place
    """
    if not lines:
        return

    # chop off params: drop everything from the first ":param" line onward
    param_boundary = next(
        (i for i, line in enumerate(lines) if ":param" in line), None
    )
    if param_boundary is not None:
        del lines[param_boundary:]
        # the description may consist of nothing but params, in which case there
        # is no trailing line left to blank out
        if lines:
            lines[-1] = ""

    # add code formatting to strings, commands, and env vars
    for i, line in enumerate(lines):
        if line.startswith((" ", ">>> ")):
            continue  # skip example code blocks
        # lines without any matching span come back unchanged
        lines[i] = _add_formatting_to_string(line)

    # add code block formatting to example console commands; walk backwards so
    # that insertions don't shift the indices that are still to be visited
    for i in range(len(lines) - 1, -1, -1):
        if lines[i].startswith(" "):
            lines.insert(i + 2, "")
            # only the first line of an example gets the directive header
            if i == 0 or not lines[i - 1].startswith(" "):
                lines.insert(i, "")
                lines.insert(i, ".. code-block:: console")

    # put usage at the top of the description; every insert goes to index 0, so
    # the pieces are added in reverse of their final order
    lines.insert(0, "")
    for usage_line in _get_usage(ctx).splitlines()[::-1]:
        lines.insert(0, _indent(usage_line))
    lines.insert(0, "")
    lines.insert(0, ".. code-block:: shell")


def process_option(app: Sphinx, ctx: Context, lines: List[str]):
    """Render double-quoted strings in sphinx-click option text as fixed-width.

    Every span matching ``STR_PATTERN`` is wrapped in double backticks. The list is
    updated in place, entry by entry.
    """

    def _as_literal(match):
        return f"``{match.group()}``"

    for idx, text in enumerate(lines):
        # re.sub hands back the text untouched when nothing matches
        lines[idx] = re.sub(STR_PATTERN, _as_literal, text)


def setup(app):
    """Register the sphinx-click formatting hooks with the Sphinx build.

    The description hook moves the usage text to the top of each command
    description, so the usage event gets a handler that empties its lines to
    suppress the built-in usage section.
    """

    def _drop_usage(app, ctx, lines):
        lines.clear()

    app.connect("sphinx-click-process-description", process_description)
    app.connect("sphinx-click-process-options", process_option)
    app.connect("sphinx-click-process-usage", _drop_usage)
6 changes: 3 additions & 3 deletions docs/source/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ Set the environment variable ``GENE_NORM_DB_URL`` to a connection description fo
Load data
+++++++++

Use the ``gene_norm_update_remote`` shell command to load data from the most recent remotely-stored data dump: ::
Use the ``gene-normalizer update-from-remote`` shell command to load data from the most recent remotely-stored data dump: ::

gene_norm_update_remote
gene-normalizer update-from-remote

Start service
+++++++++++++
Expand Down Expand Up @@ -145,7 +145,7 @@ Load data

To load all source data, and then generate normalized records, use the following shell command: ::

gene_norm_update --update_all --update_merged
gene-normalizer update --all --normalize

This will download the latest available versions of all source data files, extract and transform recognized gene concepts, load them into the database, and construct normalized concept groups. For more specific update commands, see :ref:`Loading and updating data <loading_and_updating_data>`.

Expand Down
52 changes: 5 additions & 47 deletions docs/source/managing_data/loading_and_updating_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,55 +3,13 @@
Loading and updating data
=========================

The Gene Normalizer defines a command line tool for data management. It includes functions for refreshing data, checking database status, and for the PostgreSQL data backend, dumping to a local file and updating from a remote backup.

.. note::

See the :ref:`ETL API documentation<etl-api>` for information on programmatic access to the data loader classes.

Full load/reload
----------------

Calling the Gene Normalizer update command with the ``--update_all`` and ``--update_merged`` flags will delete all existing data, fetch new source data if available, and then perform a complete reload of the database (including merged records):

.. code-block:: shell
gene_norm_update --update_all --update_merged
Reload individual source
------------------------

To update specific sources, call the ``--sources`` option with one or more source name(s) quoted and separated by spaces. While it is possible to update individual source data without also updating the normalized record data, that may affect the proper function of the normalized query endpoints, so it is recommended to include the ``--update_merged`` flag as well.

.. code-block:: shell
gene_norm_update --sources="HGNC NCBI" --update_merged
Use local data
--------------

The Gene Normalizer will fetch the latest available data from all sources if local data is out-of-date. To suppress this and force usage of local files, use the ``--use_existing`` flag:

.. code-block:: shell
gene_norm_update --update_all --use_existing
Check DB health
---------------

The shell command ``gene_norm_check_db`` performs a basic check on the database status. It first confirms that the database's schema exists, and then identifies whether metadata is available for each source, and whether gene record and normalized concept tables are non-empty. Check the process's exit code for the result (per the UNIX standard, ``0`` means success, and any other return code means failure).

.. code-block:: console
$ gene_norm_check_db
$ echo $?
1 # indicates failure
This command is equivalent to the combination of the database classes' ``check_schema_initialized`` and ``check_tables_populated`` methods:

.. code-block:: python

from gene.database import create_db
db = create_db()
db_is_healthy = db.check_schema_initialized() and db.check_tables_populated()
.. click:: gene.cli:cli
:prog: gene-normalizer
:nested: full
10 changes: 5 additions & 5 deletions docs/source/managing_data/postgresql.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,18 @@ Once created, set the environment variable ``GENE_NORM_DB_URL`` to a connection
Load from remote source
--------------------------------

The Gene Normalizer's PostgreSQL class provides the ``gene_norm_update_remote`` shell command to refresh its data directly from a remotely-stored SQL dump, instead of acquiring, transforming, and loading source data. This enables data loading on the order of seconds rather than hours. See the command description at ``gene_norm_update_remote --help`` for more information.
The Gene Normalizer's PostgreSQL class provides the ``gene-normalizer update-from-remote`` shell command to refresh its data directly from a remotely-stored SQL dump, instead of acquiring, transforming, and loading source data. This enables data loading on the order of seconds rather than hours. See the command description at ``gene-normalizer update-from-remote --help`` for more information.

By default, this command will fetch the `latest data dump <https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_latest.sql.tar.gz>`_ provided by the VICC. Alternative URLs can be set with the ``--data_url`` option: ::

gene_norm_update_remote --data_url=https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_20230322163523.sql.tar.gz
gene-normalizer update-from-remote --data_url=https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_20230322163523.sql.tar.gz


Create SQL dump from database
-----------------------------

The Gene Normalizer's PostgreSQL class also provides the ``gene_norm_dump`` shell command to create a SQL dump of current data into a file. This command will create a file named ``gene_norm_YYYYMMDDHHmmss.sql`` in the current directory; the ``-o`` option can be used to specify an alternate location, like so: ::
The Gene Normalizer's PostgreSQL class also provides the ``gene-normalizer dump-database`` shell command to create a SQL dump of current data into a file. This command will create a file named ``gene_norm_YYYYMMDDHHmmss.sql`` in the current directory; the ``-o`` option can be used to specify an alternate location, like so: ::

gene_norm_dump -o ~/.gene_data/
gene-normalizer dump-database -o ~/.gene_data/

See ``gene_norm_dump --help`` for more information.
See ``gene-normalizer dump-database --help`` for more information.
8 changes: 3 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ docs = [
"sphinx-copybutton==0.5.2",
"sphinxext-opengraph==0.8.2",
"furo==2023.3.27",
"gravis==0.1.0"
"gravis==0.1.0",
"sphinx-click==5.0.1",
]

[project.urls]
Expand All @@ -57,10 +58,7 @@ Source = "https://github.com/cancervariants/gene-normalization"
"Bug Tracker" = "https://github.com/cancervariants/gene-normalization/issues"

[project.scripts]
gene_norm_update = "gene.cli:update_normalizer_db"
gene_norm_update_remote = "gene.cli:update_from_remote"
gene_norm_dump = "gene.cli:dump_database"
gene_norm_check_db = "gene.cli:check_db"
gene-normalizer = "gene.cli:cli"

[build-system]
requires = ["setuptools>=61.0"]
Expand Down
Loading

0 comments on commit 0e4f7d9

Please sign in to comment.