Added a glossary option for the abbr extension

The glossary file also uses the Markdown abbreviation syntax (`AbbrBlockprocessor` is used to process the file) and keeps the glossary definitions separate from the page definitions, allowing the glossary to be applied to every page while only being processed once.
Python-Markdown · Jun 3, 2024 · 2009c02 · 2009c02
1 parent 9fed7aa
commit 2009c02
Show file tree

Hide file tree

Showing 4 changed files with 89 additions and 8 deletions.
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -27,16 +27,18 @@ Abbreviations are now sorted by length before executing `AbbrTreeprocessor`
 to ensure that multi-word abbreviations are implemented even if an abbreviation
 exists for one of those component words. (#1465)
 
-Added an optional `use_last_abbr` configuration option to the abbreviations
-extension. Default (`True`) maintains the existing behavior. `False` causes
-the extension to only use the first instance of an abbreviation, rather than
-the last.
-
 Empty abbreviations are now skipped by `AbbrTreeprocessor`. This avoids applying
 abbr tags to text without a title value. This also allows disabling an
 abbreviation, which may be useful for documents that use two terms with
 identical abbreviations.
 
+Added an optional `glossary` configuration option to the abbreviations extension.
+This provides a simple and efficient way to apply abbreviations to every page.
+
+Added an optional `use_last_abbr` configuration option to the abbreviations
+extension. Default (`True`) maintains the existing behavior. `False` causes
+the extension to only use the first instance of an abbreviation, rather than
+the last. 
 
 
 ### Fixed

diff --git a/docs/extensions/abbreviations.md b/docs/extensions/abbreviations.md
@@ -51,7 +51,20 @@ Usage
 See [Extensions](index.md) for general extension usage. Use `abbr` as the name
 of the extension.
 
-This extension does not accept any special configuration options.
+The following options are provided to configure the output:
+
+* **`use_last_abbr`**:
+    `True` (the default) to use the last instance of an abbreviation, rather than the first instance.
+
+    Setting this to `False` is useful when auto-appending glossary files to pages while still wanting the
+    page's abbreviations to take precedence. `False` is not recommended for use with the `glossary` option.
+
+* **`glossary`**:
+    Path to a Markdown file containing abbreviations to be applied to every page.
+
+    The abbreviations from this file will be the default abbreviations applied to every page, with
+    abbreviations defined on the page taking precedence (unless `use_last_abbr` is set to `False`). The
+    glossary file should use the same Markdown syntax described on this page.
 
 A trivial example:
 

diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py
@@ -22,6 +22,7 @@
 
 from __future__ import annotations
 
+import codecs
 from . import Extension
 from ..util import parseBoolValue
 from ..blockprocessors import BlockProcessor
@@ -48,17 +49,46 @@ def __init__(self, **kwargs):
                 'True to use the last instance of an abbreviation, rather than the first instance.'
                 'Default: `True`.'
             ],
+            'glossary': [
+                '',
+                'Path to the Markdown file containing abbreviations to be applied to every page.'
+                "Default: `''`"
+            ],
         }
         """ Default configuration options. """
         super().__init__(**kwargs)
         self.abbrs = {}
+        self.glossary = {}
 
     def reset(self):
         """ Clear all previously defined abbreviations. """
         self.abbrs.clear()
+        if (self.glossary):  # skip the update when no glossary entries are loaded
+            self.abbrs.update(self.glossary)  # re-seed the cleared mapping with the glossary entries
+
+    def load_glossary(self, md: Markdown, filename: str):
+        if filename and isinstance(filename, str):
+            input_file = codecs.open(filename, mode="r", encoding='utf-8')
+            text = input_file.read()
+            input_file.close()
+            text = text.lstrip('\ufeff')  # remove the byte-order mark
+            try:
+                text = str(text)
+            except UnicodeDecodeError as e:  # pragma: no cover
+                # Customize error message while maintaining original traceback
+                e.reason += '. -- Note: Markdown only accepts Unicode input!'
+                raise
+            lines = text.split("\n")
+
+            bp = AbbrBlockprocessor(md.parser, self.glossary, self.getConfigs())
+            for line in lines:
+                    bp.run(None, [line])
 
     def extendMarkdown(self, md):
         """ Insert `AbbrTreeprocessor` and `AbbrBlockprocessor`. """
+        if (self.config['glossary'][0]):
+            self.load_glossary(md, self.config['glossary'][0])
+        self.abbrs.update(self.glossary)
         md.registerExtension(self)
         md.treeprocessors.register(AbbrTreeprocessor(md, self.abbrs), 'abbr', 7)
         md.parser.blockprocessors.register(AbbrBlockprocessor(md.parser, self.abbrs, self.getConfigs()), 'abbr', 16)
@@ -85,7 +115,7 @@ def iter_element(self, el: etree.Element, parent: etree.Element | None = None) -
                     el.insert(0, abbr)
                     text = text[:m.start()]
             el.text = text
-        if parent and el.tail:
+        if parent is not None and el.tail:
             tail = el.tail
             index = list(parent).index(el) + 1
             for m in reversed(list(self.RE.finditer(tail))):

diff --git a/tests/test_syntax/extensions/test_abbr.py b/tests/test_syntax/extensions/test_abbr.py
@@ -20,11 +20,13 @@
 License: BSD (see LICENSE.md for details).
 """
 
+import os
+from tempfile import mkstemp
+import atexit
 from markdown.test_tools import TestCase
 from markdown import Markdown
 from markdown.extensions.abbr import AbbrExtension
 
-
 class TestAbbr(TestCase):
     maxDiff = None
 
@@ -155,6 +157,40 @@ def test_abbr_override_Ignored(self):
             extensions=[AbbrExtension(use_last_abbr=False)]
         )
 
+    def test_abbr_glossary(self):
+        # Create temporary glossary file and set a trigger to guarantee it is deleted even if this test fails
+        temp_file, glossary_file = mkstemp(suffix='.md')
+        os.close(temp_file)
+        cleanup_trigger = atexit.register(os.remove, glossary_file)
+
+        with open(glossary_file, 'w', encoding='utf-8') as temp_file:
+            temp_file.writelines([
+                "*[ABBR]: Abbreviation\n",
+                "*[abbr]: Abbreviation\n",
+                "*[HTML]: Hyper Text Markup Language\n",
+                "*[W3C]:  World Wide Web Consortium\n"
+            ])
+
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                ABBR abbr
+                
+                HTML W3C
+                """
+            ),
+            self.dedent(
+                """
+                <p><abbr title="Abbreviation">ABBR</abbr> <abbr title="Abbreviation">abbr</abbr></p>
+                <p><abbr title="Hyper Text Markup Language">HTML</abbr> <abbr title="World Wide Web Consortium">W3C</abbr></p>
+                """
+            ),
+            extensions=[AbbrExtension(glossary=glossary_file)]
+        )
+        # cleanup
+        os.remove(glossary_file)
+        atexit.unregister(cleanup_trigger)
+
     def test_abbr_nested(self):
         self.assertMarkdownRenders(
             self.dedent(