From 42b10783c25882abaa8de3ab602e04a759ddf761 Mon Sep 17 00:00:00 2001 From: Mike McKiernan Date: Sat, 1 Feb 2025 08:27:02 -0500 Subject: [PATCH] chore(docs): Enable a minimal docs linkcheck build Eliminate false positives with script Signed-off-by: Mike McKiernan --- .github/workflows/gh-docs.yml | 27 +++++++++++++++- docs/README.md | 51 ++++++++++++++++++++++++++++++ docs/check_for_broken_links.sh | 50 +++++++++++++++++++++++++++++ docs/false_positives.json | 8 +++++ docs/source/conf.py | 36 ++++++--------------- requirements/requirements_docs.txt | 19 ++++------- 6 files changed, 151 insertions(+), 40 deletions(-) create mode 100644 docs/README.md create mode 100755 docs/check_for_broken_links.sh create mode 100644 docs/false_positives.json diff --git a/.github/workflows/gh-docs.yml b/.github/workflows/gh-docs.yml index 6f8e8ea1e3e1..fb4c546b2e10 100644 --- a/.github/workflows/gh-docs.yml +++ b/.github/workflows/gh-docs.yml @@ -8,6 +8,9 @@ on: # Set the access for individual scopes permissions: write-all +env: + PYTHON_VERSION: "3.11" + jobs: deploy: runs-on: ubuntu-latest @@ -16,7 +19,7 @@ jobs: image: squidfunk/mkdocs-material steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 if: github.event.repository.fork == false with: ref: gh-pages-src @@ -36,3 +39,25 @@ jobs: continue-on-error: true run: mkdocs gh-deploy --force + linkcheck: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Set up Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + - name: Install Sphinx dependencies + run: python3 -m pip install -r requirements/requirements_docs.txt + - name: Linkcheck docs build + run: make -C docs linkcheck || true + - name: Eliminate false positives + run: ./docs/check_for_broken_links.sh || true + - name: Upload linkcheck output + uses: actions/upload-artifact@v4 + with: + name: linkcheck-artifact + path: docs/build/linkcheck + 
if-no-files-found: error + retention-days: 7 diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000000..1f0b3256359c --- /dev/null +++ b/docs/README.md @@ -0,0 +1,51 @@ +# Documentation Process for NeMo + +## Building the Documentation + +1. Create and activate a virtual environment. + +1. Install the documentation dependencies: + + ```console + $ python3 -m pip install -r requirements/requirements_docs.txt + ``` + +1. Build the documentation: + + ```console + $ make -C docs html + ``` + +## Checking for Broken Links + +1. Build the documentation, as described in the preceding section, but use the following command: + + ```shell + make -C docs clean linkcheck + ``` + +1. Run the link-checking script: + + ```shell + ./docs/check_for_broken_links.sh + ``` + +If there are no broken links, then the script exits with `0`. + +If the script produces any output, copy and paste the `uri` value into your browser to confirm +that the link is broken. + +```json +{ + "filename": "nlp/text_normalization/nn_text_normalization.rst", + "lineno": 247, + "status": "broken", + "code": 0, + "uri": "https://research.fb.com/wp-content/uploads/2019/03/Neural-Models-of-Text-Normalization-for-Speech-Applications.pdf", + "info": "400 Client Error: Bad Request for url: https://research.facebook.com/wp-content/uploads/2019/03/Neural-Models-of-Text-Normalization-for-Speech-Applications.pdf" +} +``` + +If the link is OK, and this is the case with many URLs that reference GitHub repository file headings, +then copy and paste the JSON output and add it to `docs/false_positives.json`. +Run the script again to confirm that the URL is no longer reported as a broken link. 
diff --git a/docs/check_for_broken_links.sh b/docs/check_for_broken_links.sh
new file mode 100755
index 000000000000..79976896967f
--- /dev/null
+++ b/docs/check_for_broken_links.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+DOCS_DIR=$(dirname "${BASH_SOURCE[0]}")
+FALSE_POSITIVES_JSON="${DOCS_DIR}/false_positives.json"
+LINKCHECK_JSON="${DOCS_DIR}/build/linkcheck/output.json"
+
+# Exit with 2 unless jq and both JSON input files are available.
+function check_environment {
+  local err=0
+  if ! [ -x "$(command -v jq)" ]; then
+    >&2 echo "jq is required but is not found."
+    ((err++))
+  fi
+  if [ ! -f "${FALSE_POSITIVES_JSON}" ]; then
+    >&2 echo "A JSON file with false positives is required: ${FALSE_POSITIVES_JSON}"
+    ((err++))
+  fi
+  if [ ! -f "${LINKCHECK_JSON}" ]; then
+    >&2 echo "Did not find linkcheck output JSON file: ${LINKCHECK_JSON}."
+    >&2 echo "Run Sphinx with the linkcheck arg: make -C docs clean linkcheck"
+    ((err++))
+  fi
+  if [ "${err}" -gt 0 ]; then
+    exit 2
+  fi
+}
+
+# Print broken links that are not known false positives; exit with their count (max 255).
+function check_links {
+  local err=0 broken count entry link resp i
+  # Slurp the stream of linkcheck JSON objects into a single array of broken entries.
+  broken=$(jq -s 'map(select(.status == "broken"))' "${LINKCHECK_JSON}") || exit 2
+  count=$(echo "${broken}" | jq 'length')
+  # A C-style loop runs zero times when count is 0; BSD "seq 0 -1" counts down instead.
+  for ((i = 0; i < count; i++)); do
+    entry=$(echo "${broken}" | jq ".[${i}]")
+    link=$(echo "${entry}" | jq -r '.uri')
+    [ -n "${DEBUG}" ] && echo >&2 "Checking for false positive: ${link}"
+    resp=$(jq --arg check "${link}" -s 'any(.uri == $check)' < "${FALSE_POSITIVES_JSON}")
+    # "false" indicates that the URL did not match any of the URIs in the false positive file.
+    if [ "false" = "${resp}" ]; then
+      ((err++))
+      echo "${entry}"
+    fi
+  done
+  exit "$((err > 255 ? 255 : err))"
+}
+
+check_environment
+check_links
diff --git a/docs/false_positives.json b/docs/false_positives.json
new file mode 100644
index 000000000000..7f461c0eb005
--- /dev/null
+++ b/docs/false_positives.json
@@ -0,0 +1,8 @@
+{
+  "filename": "nlp/text_normalization/nn_text_normalization.rst",
+  "lineno": 247,
+  "status": "broken",
+  "code": 0,
+  "uri": "https://research.fb.com/wp-content/uploads/2019/03/Neural-Models-of-Text-Normalization-for-Speech-Applications.pdf",
+  "info": "400 Client Error: Bad Request for url: https://research.facebook.com/wp-content/uploads/2019/03/Neural-Models-of-Text-Normalization-for-Speech-Applications.pdf"
+}
diff --git a/docs/source/conf.py b/docs/source/conf.py
index be0f9f933851..4481870ea132 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -20,8 +20,6 @@
 import sys
 import glob
 
-import sphinx_book_theme
-
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
@@ -120,7 +118,7 @@
     # "sphinx.ext.autosectionlabel",
     "sphinxcontrib.bibtex",
     "sphinx_copybutton",
-    "sphinxext.opengraph",
+    # "sphinxext.opengraph",
 ]
 
 bibtex_bibfiles = [
@@ -178,16 +176,13 @@
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This patterns also effect to html_static_path and html_extra_path
 exclude_patterns = []
 
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = "default"
-
 ### Previous NeMo theme
 # # NVIDIA theme settings.
 # html_theme = 'nvidia_theme'
@@ -223,27 +218,14 @@
 # html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
 
 
-html_theme = "sphinx_book_theme"
-html_logo = os.path.join('nv_logo.png')
+html_theme = "nvidia_sphinx_theme"
+html_copy_source = False
+html_show_sourcelink = False
+html_show_sphinx = False
 html_title = 'NVIDIA NeMo'
 
 html_theme_options = {
-    'logo_only': False,
-    'display_version': True,
-    # 'prev_next_buttons_location': 'bottom',
-    # 'style_external_links': False,
-    # 'style_nav_header_background': '#000000',
-    # Toc options
-    'collapse_navigation': False,
-    # 'sticky_navigation': False,
-    'navigation_depth': 10,
-    # 'includehidden': False,
-    # 'titles_only': False,
-    # Sphinx Book theme,
-    'repository_url': 'https://github.com/NVIDIA/NeMo',
-    'use_repository_button': True,
-    'show_navbar_depth': 1,
-    'show_toc_level': 10,
+    "icon_links": [],
 }
 
 
@@ -272,8 +254,8 @@ def setup(app):
 # ]
 
 # OpenGraph settings
-ogp_site_url = 'https://nvidia.github.io/NeMo/'
-ogp_image = 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/_static/nv_logo.png'
+# ogp_site_url = 'https://nvidia.github.io/NeMo/'
+# ogp_image = 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/_static/nv_logo.png'
 
 # MathJax CDN
 # follow recommendation here https://www.sphinx-doc.org/en/master/usage/extensions/math.html#module-sphinx.ext.mathjax
diff --git a/requirements/requirements_docs.txt b/requirements/requirements_docs.txt
index ff3ec5202b0e..a2d2f3301fe9 100644
--- a/requirements/requirements_docs.txt
+++ b/requirements/requirements_docs.txt
@@ -1,12 +1,7 @@
-boto3
-Jinja2
-latexcodec
-numpy
-pydata-sphinx-theme
-Sphinx
-sphinx-book-theme
-sphinx-copybutton
-sphinxcontrib-bibtex
-sphinxext-opengraph
-urllib3
-wrapt
+myst-parser<5
+nvidia-sphinx-theme
+sphinx<7.5
+sphinx-copybutton<=0.6
+sphinx-reredirects<0.2
+sphinxcontrib-bibtex<2.7
+toml==0.10.2