enhancement!: CLI overhaul (#299)

Co-authored-by: Kori Kuzma <[email protected]>
cancervariants · Dec 13, 2023 · 0e4f7d9 · 0e4f7d9
1 parent e64953d
commit 0e4f7d9
Show file tree

Hide file tree

Showing 11 changed files with 531 additions and 328 deletions.
diff --git a/docs/source/api/etl_api.rst b/docs/source/api/etl_api.rst
@@ -3,13 +3,11 @@
 Source ETL API
 ==============
 
-Base
-----
+Update methods
+--------------
 
-.. autoclass:: gene.etl.base.Base
-    :members:
-    :special-members: __init__
-    :undoc-members:
+.. automodule:: gene.etl.update
+   :members:
 
 Exceptions
 ----------

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -19,6 +19,7 @@
     "sphinx_autodoc_typehints",
     "sphinx.ext.linkcode",
     "sphinx_copybutton",
+    "sphinx_click",
 ]
 
 templates_path = ["_templates"]
@@ -77,9 +78,112 @@ def linkcode_resolve(domain, info):
     if not info["module"]:
         return None
     filename = info["module"].replace(".", "/")
-    return f"https://github.com/cancervariants/gene-normalization/blob/main/{filename}.py"  # noqa: E501
+    return (
+        f"https://github.com/cancervariants/gene-normalization/blob/main/{filename}.py"  # noqa: E501
+    )
 
 
 # -- code block style --------------------------------------------------------
 # Pygments themes for highlighted code: one for light mode, one for dark mode.
 pygments_style = "default"
 pygments_dark_style = "monokai"
+
+# -- sphinx-click ------------------------------------------------------------
+# These functions let us write descriptions/docstrings in a way that doesn't look
+# weird in the Click CLI, but get additional formatting in the sphinx-click autodocs for
+# better readability.
+from typing import List
+import re
+
+from click.core import Context
+from sphinx.application import Sphinx
+from sphinx_click.ext import _get_usage, _format_usage, _indent
+
+CMD_PATTERN = r"--[^ ]+"
+STR_PATTERN = r"\"[^ ]+\""
+SNAKE_PATTERN = r"[A-Z]+_[A-Z_]*[A-Z]"
+# Alternation of the three patterns above. Substituting with it in a single pass
+# wraps each span of text at most once, even when more than one pattern would
+# match it (e.g. a double-quoted SNAKE_CASE name).
+_FMT_PATTERN = re.compile("|".join((CMD_PATTERN, STR_PATTERN, SNAKE_PATTERN)))
+
+
+def _add_formatting_to_string(line: str) -> str:
+    """Add fixed-width code formatting to span sections in lines:
+
+    * shell options, eg `--all`
+    * double-quoted strings, eg `"HGNC"`
+    * all caps SNAKE_CASE env vars, eg `GENE_NORM_REMOTE_DB_URL`
+
+    Matching is done in one left-to-right pass, so text that has already been
+    wrapped is never matched and wrapped a second time.
+
+    :param line: a single line of description text
+    :return: the line with every matched span surrounded by double backticks;
+        unchanged if nothing matches
+    """
+    return _FMT_PATTERN.sub(lambda match: f"``{match.group()}``", line)
+
+
+def process_description(app: Sphinx, ctx: Context, lines: List[str]):
+    """Add custom formatting to sphinx-click autodoc descriptions.
+
+    * remove :param: :return: etc
+    * add fixed-width (code) font to certain words
+    * add code block formatting to example shell commands
+    * move primary usage example to the top of the description
+
+    Because we have to modify the lines list in place, we have to make multiple passes
+    through it to format everything correctly. Nothing is returned.
+
+    :param app: Sphinx application object (not used in this handler)
+    :param ctx: Click context, used to render the usage string
+    :param lines: description lines, edited in place
+    """
+    if not lines:
+        return
+
+    # chop off params: drop everything from the first ":param" line onward
+    param_boundary = next(
+        (i for i, line in enumerate(lines) if ":param" in line), None
+    )
+    if param_boundary is not None:
+        del lines[param_boundary:]
+        # blank out the last remaining line; if the description started with
+        # ":param" the list is empty now and there is nothing to blank out
+        if lines:
+            lines[-1] = ""
+
+    # add code formatting to strings, commands, and env vars
+    for i, line in enumerate(lines):
+        if line.startswith(("   ", ">>> ")):
+            continue  # skip example code blocks
+        # lines without any matching span come back unchanged
+        lines[i] = _add_formatting_to_string(line)
+
+    # add code block formatting to example console commands
+    # (walk backwards so that insertions do not shift indices still to be visited)
+    for i in range(len(lines) - 1, -1, -1):
+        if lines[i].startswith("    "):
+            lines.insert(i + 2, "")
+            # first line of an indented run: put the directive and a blank line above
+            if i == 0 or not lines[i - 1].startswith("    "):
+                lines.insert(i, "")
+                lines.insert(i, ".. code-block:: console")
+
+    # put usage at the top of the description
+    usage_block = [".. code-block:: shell", ""]
+    usage_block.extend(_indent(usage_line) for usage_line in _get_usage(ctx).splitlines())
+    usage_block.append("")
+    lines[:0] = usage_block
+
+
+def process_option(app: Sphinx, ctx: Context, lines: List[str]):
+    """Wrap double-quoted strings in sphinx-click autodoc option text in fixed-width
+    markup, editing ``lines`` in place.
+    """
+    for idx, text in enumerate(lines):
+        # re.sub leaves text without a quoted string untouched
+        lines[idx] = re.sub(STR_PATTERN, lambda match: f"``{match.group()}``", text)
+
+
+def setup(app):
+    """Connect the format customization handlers to the sphinx-click events.
+
+    The description handler moves usage to the top of the command description, so
+    the usage event gets a handler that empties the built-in usage section.
+    """
+
+    def _clear_usage(app, ctx, lines):
+        lines.clear()
+
+    handlers = {
+        "sphinx-click-process-description": process_description,
+        "sphinx-click-process-options": process_option,
+        "sphinx-click-process-usage": _clear_usage,
+    }
+    for event_name, handler in handlers.items():
+        app.connect(event_name, handler)
diff --git a/docs/source/install.rst b/docs/source/install.rst
@@ -48,9 +48,9 @@ Set the environment variable ``GENE_NORM_DB_URL`` to a connection description fo
 Load data
 +++++++++
 
-Use the ``gene_norm_update_remote`` shell command to load data from the most recent remotely-stored data dump: ::
+Use the ``gene-normalizer update-from-remote`` shell command to load data from the most recent remotely-stored data dump: ::
 
-    gene_norm_update_remote
+    gene-normalizer update-from-remote
 
 Start service
 +++++++++++++
@@ -145,7 +145,7 @@ Load data
 
 To load all source data, and then generate normalized records, use the following shell command: ::
 
-    gene_norm_update --update_all --update_merged
+    gene-normalizer update --all --normalize
 
 This will download the latest available versions of all source data files, extract and transform recognized gene concepts, load them into the database, and construct normalized concept groups. For more specific update commands, see :ref:`Loading and updating data <loading_and_updating_data>`.
 

diff --git a/docs/source/managing_data/loading_and_updating_data.rst b/docs/source/managing_data/loading_and_updating_data.rst
@@ -3,55 +3,13 @@
 Loading and updating data
 =========================
 
+The Gene Normalizer defines a command line tool for data management. It includes functions for refreshing data, checking database status, and, for the PostgreSQL data backend, dumping to a local file and updating from a remote backup.
+
 .. note::
 
     See the :ref:`ETL API documentation<etl-api>` for information on programmatic access to the data loader classes.
 
-Full load/reload
-----------------
-
-Calling the Gene Normalizer update command with the ``--update_all`` and ``--update_merged`` flags will delete all existing data, fetch new source data if available, and then perform a complete reload of the database (including merged records):
-
-.. code-block:: shell
-
-    gene_norm_update --update_all --update_merged
-
-
-Reload individual source
-------------------------
-
-To update specific sources, call the ``--sources`` option with one or more source name(s) quoted and separated by spaces. While it is possible to update individual source data without also updating the normalized record data, that may affect the proper function of the normalized query endpoints, so it is recommended to include the ``--update_merged`` flag as well.
-
-.. code-block:: shell
-
-    gene_norm_update --sources="HGNC NCBI" --update_merged
-
-
-Use local data
---------------
-
-The Gene Normalizer will fetch the latest available data from all sources if local data is out-of-date. To suppress this and force usage of local files, use the `--use_existing` flag:
-
-.. code-block:: shell
-
-    gene_norm_update --update_all --use_existing
-
-
-Check DB health
----------------
-
-The shell command ``gene_norm_check_db`` performs a basic check on the database status. It first confirms that the database's schema exists, and then identifies whether metadata is available for each source, and whether gene record and normalized concept tables are non-empty. Check the process's exit code for the result (per the UNIX standard, ``0`` means success, and any other return code means failure).
-
-.. code-block:: console
-
-    $ gene_norm_check_db
-    $ echo $?
-    1  # indicates failure
-
-This command is equivalent to the combination of the database classes' ``check_schema_initialized`` and ``check_tables_populated`` methods:
-
-.. code-block:: python
 
-   from gene.database import create_db
-   db = create_db()
-   db_is_healthy = db.check_schema_initialized() and db.check_tables_populated()
+.. click:: gene.cli:cli
+   :prog: gene-normalizer
+   :nested: full
diff --git a/docs/source/managing_data/postgresql.rst b/docs/source/managing_data/postgresql.rst
@@ -24,18 +24,18 @@ Once created, set the environment variable ``GENE_NORM_DB_URL`` to a connection
 Load from remote source
 --------------------------------
 
-The Gene Normalizer's PostgreSQL class provides the ``gene_norm_update_remote`` shell command to refresh its data directly from a remotely-stored SQL dump, instead of acquiring, transforming, and loading source data. This enables data loading on the order of seconds rather than hours. See the command description at ``gene_norm_update_remote --help`` for more information.
+The Gene Normalizer's PostgreSQL class provides the ``gene-normalizer update-from-remote`` shell command to refresh its data directly from a remotely-stored SQL dump, instead of acquiring, transforming, and loading source data. This enables data loading on the order of seconds rather than hours. See the command description at ``gene-normalizer update-from-remote --help`` for more information.
 
 By default, this command will fetch the `latest data dump <https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_latest.sql.tar.gz>`_ provided by the VICC. Alternative URLs can be set with the ``--data_url`` option: ::
 
-    gene_norm_update_remote --data_url=https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_20230322163523.sql.tar.gz
+    gene-normalizer update-from-remote --data_url=https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_20230322163523.sql.tar.gz
 
 
 Create SQL dump from database
 -----------------------------
 
-The Gene Normalizer's PostgreSQL class also provides the ``gene_norm_dump`` shell command to create a SQL dump of current data into a file. This command will create a file named ``gene_norm_YYYYMMDDHHmmss.sql`` in the current directory; the ``-o`` option can be used to specify an alternate location, like so: ::
+The Gene Normalizer's PostgreSQL class also provides the ``gene-normalizer dump-database`` shell command to create a SQL dump of current data into a file. This command will create a file named ``gene_norm_YYYYMMDDHHmmss.sql`` in the current directory; the ``-o`` option can be used to specify an alternate location, like so: ::
 
-    gene_norm_dump -o ~/.gene_data/
+    gene-normalizer dump-database -o ~/.gene_data/
 
-See ``gene_norm_dump --help`` for more information.
+See ``gene-normalizer dump-database --help`` for more information.
diff --git a/pyproject.toml b/pyproject.toml
@@ -46,7 +46,8 @@ docs = [
     "sphinx-copybutton==0.5.2",
     "sphinxext-opengraph==0.8.2",
     "furo==2023.3.27",
-    "gravis==0.1.0"
+    "gravis==0.1.0",
+    "sphinx-click==5.0.1",
 ]
 
 [project.urls]
@@ -57,10 +58,7 @@ Source = "https://github.com/cancervariants/gene-normalization"
 "Bug Tracker" = "https://github.com/cancervariants/gene-normalization/issues"
 
 [project.scripts]
-gene_norm_update = "gene.cli:update_normalizer_db"
-gene_norm_update_remote = "gene.cli:update_from_remote"
-gene_norm_dump = "gene.cli:dump_database"
-gene_norm_check_db = "gene.cli:check_db"
+gene-normalizer = "gene.cli:cli"
 
 [build-system]
 requires = ["setuptools>=61.0"]