Refactor abbr Extension

A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated `AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists, avoiding a conflict between the two extensions. Fixes #1460.
Python-Markdown · Apr 24, 2024 · c03724b · c03724b
1 parent 993b57b
commit c03724b
Show file tree

Hide file tree

Showing 3 changed files with 124 additions and 16 deletions.
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -10,6 +10,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [unreleased]
 
+### Changed
+
+#### Refactor `abbr` Extension
+
+A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated
+`AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists,
+avoiding a conflict between the two extensions (#1460).
+
 ### Fixed
 
 * Fixed links to source code on GitHub from the documentation (#1453).

diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py
@@ -25,7 +25,8 @@
 from . import Extension
 from ..blockprocessors import BlockProcessor
 from ..inlinepatterns import InlineProcessor
-from ..util import AtomicString
+from ..treeprocessors import Treeprocessor
+from ..util import AtomicString, deprecated
 import re
 import xml.etree.ElementTree as etree
 
@@ -34,32 +35,79 @@ class AbbrExtension(Extension):
     """ Abbreviation Extension for Python-Markdown. """
 
     def extendMarkdown(self, md):
-        """ Insert `AbbrPreprocessor` before `ReferencePreprocessor`. """
-        md.parser.blockprocessors.register(AbbrPreprocessor(md.parser), 'abbr', 16)
-
-
-class AbbrPreprocessor(BlockProcessor):
-    """ Abbreviation Preprocessor - parse text for abbr references. """
+        """ Insert `AbbrTreeprocessor` and `AbbrBlockprocessor`. """
+        treeprocessor = AbbrTreeprocessor(md)
+        md.treeprocessors.register(treeprocessor, 'abbr', 7)
+        md.parser.blockprocessors.register(AbbrBlockprocessor(md.parser, treeprocessor.abbrs), 'abbr', 16)
+
+
+class AbbrTreeprocessor(Treeprocessor):
+    """ Replace abbr text with `<abbr>` elements. """
+
+    def __init__(self, md: Markdown | None=None):
+        self.abbrs = {}  # Maps each abbreviation to the title used for its `<abbr>` element.
+        self.RE = None  # Compiled by `run` from the keys of `self.abbrs`.
+        super().__init__(md)
+
+    def iter_element(self, el, parent=None):
+        ''' Recursively iterate over elements, run regex on text and wrap matches in `abbr` tags. '''
+        for child in reversed(el):
+            self.iter_element(child, el)
+        if text := el.text:
+            for m in reversed(list(self.RE.finditer(text))):  # Last match first so earlier offsets stay valid.
+                abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]})
+                abbr.text = AtomicString(m.group(0))
+                abbr.tail = text[m.end():]
+                el.insert(0, abbr)
+                text = text[:m.start()]
+            el.text = text
+        if parent is not None and el.tail:  # Explicit `None` test; `Element` truth value is deprecated.
+            tail = el.tail
+            index = list(parent).index(el) + 1  # New `abbr` elements go directly after `el`.
+            for m in reversed(list(self.RE.finditer(tail))):
+                abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]})
+                abbr.text = AtomicString(m.group(0))
+                abbr.tail = tail[m.end():]
+                parent.insert(index, abbr)
+                tail = tail[:m.start()]
+            el.tail = tail
+
+    def run(self, root: etree.Element) -> etree.Element | None:
+        ''' Step through tree to find known abbreviations. '''
+        if not self.abbrs:
+            # No abbrs defined. Skip running processor.
+            return
+        # Build and compile regex
+        self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in self.abbrs) })\\b")
+        # Step through tree and modify on matches
+        self.iter_element(root)
+        return
+
+
+class AbbrBlockprocessor(BlockProcessor):
+    """ Abbreviation Blockprocessor - parse text for abbr references. """
 
     RE = re.compile(r'^[*]\[(?P<abbr>[^\\]*?)\][ ]?:[ ]*\n?[ ]*(?P<title>.*)$', re.MULTILINE)
 
+    def __init__(self, parser, abbrs):
+        self.abbrs = abbrs
+        super().__init__(parser)
+
     def test(self, parent: etree.Element, block: str) -> bool:
         return True
 
     def run(self, parent: etree.Element, blocks: list[str]) -> bool:
         """
         Find and remove all Abbreviation references from the text.
-        Each reference is set as a new `AbbrPattern` in the markdown instance.
+        Each reference is added to the abbrs collection.
 
         """
         block = blocks.pop(0)
         m = self.RE.search(block)
         if m:
             abbr = m.group('abbr').strip()
             title = m.group('title').strip()
-            self.parser.md.inlinePatterns.register(
-                AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2
-            )
+            self.abbrs[abbr] = title
             if block[m.end():].strip():
                 # Add any content after match back to blocks as separate block
                 blocks.insert(0, block[m.end():].lstrip('\n'))
@@ -71,11 +119,8 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
         blocks.insert(0, block)
         return False
 
-    def _generate_pattern(self, text: str) -> str:
-        """ Given a string, returns a regex pattern to match that string. """
-        return f"(?P<abbr>\\b{ re.escape(text) }\\b)"
-
 
+@deprecated("This class will be removed in the future; use `AbbrTreeprocessor` instead.")
 class AbbrInlineProcessor(InlineProcessor):
     """ Abbreviation inline pattern. """
 

diff --git a/tests/test_syntax/extensions/test_abbr.py b/tests/test_syntax/extensions/test_abbr.py
@@ -60,7 +60,7 @@ def test_abbr_lower(self):
             )
         )
 
-    def test_abbr_multiple(self):
+    def test_abbr_multiple_in_text(self):
         self.assertMarkdownRenders(
             self.dedent(
                 """
@@ -79,6 +79,44 @@ def test_abbr_multiple(self):
             )
         )
 
+    def test_abbr_multiple_in_tail(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *The* HTML specification
+                is maintained by the W3C.
+
+                *[HTML]: Hyper Text Markup Language
+                *[W3C]:  World Wide Web Consortium
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>The</em> <abbr title="Hyper Text Markup Language">HTML</abbr> specification
+                is maintained by the <abbr title="World Wide Web Consortium">W3C</abbr>.</p>
+                """
+            )
+        )
+
+    def test_abbr_multiple_nested(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                The *HTML* specification
+                is maintained by the *W3C*.
+
+                *[HTML]: Hyper Text Markup Language
+                *[W3C]:  World Wide Web Consortium
+                """
+            ),
+            self.dedent(
+                """
+                <p>The <em><abbr title="Hyper Text Markup Language">HTML</abbr></em> specification
+                is maintained by the <em><abbr title="World Wide Web Consortium">W3C</abbr></em>.</p>
+                """
+            )
+        )
+
     def test_abbr_override(self):
         self.assertMarkdownRenders(
             self.dedent(
@@ -325,3 +363,20 @@ def test_abbr_bracket(self):
                 """
             )
         )
+
+    def test_abbr_with_attr_list(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *[abbr]: Abbreviation Definition
+
+                ![Image with abbr in title](abbr.png){title="Image with abbr in title"}
+                """
+            ),
+            self.dedent(
+                """
+                <p><img alt="Image with abbr in title" src="abbr.png" title="Image with abbr in title" /></p>
+                """
+            ),
+            extensions=['abbr', 'attr_list']  # Both enabled: `alt` and `title` must stay plain text.
+        )