diff --git a/docs/changelog.md b/docs/changelog.md index 86b3b5fd..25476435 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -10,6 +10,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [unreleased] +### Changed + +#### Refactor `abbr` Extension + +A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated +`AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists, +avoiding a conflict between the two extensions (#1460). + ### Fixed * Fixed links to source code on GitHub from the documentation (#1453). diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py index 1c7185b2..6dbfcf51 100644 --- a/markdown/extensions/abbr.py +++ b/markdown/extensions/abbr.py @@ -25,7 +25,8 @@ from . import Extension from ..blockprocessors import BlockProcessor from ..inlinepatterns import InlineProcessor -from ..util import AtomicString +from ..treeprocessors import Treeprocessor +from ..util import AtomicString, deprecated import re import xml.etree.ElementTree as etree @@ -34,22 +35,71 @@ class AbbrExtension(Extension): """ Abbreviation Extension for Python-Markdown. """ def extendMarkdown(self, md): - """ Insert `AbbrPreprocessor` before `ReferencePreprocessor`. """ - md.parser.blockprocessors.register(AbbrPreprocessor(md.parser), 'abbr', 16) - - -class AbbrPreprocessor(BlockProcessor): - """ Abbreviation Preprocessor - parse text for abbr references. """ + """ Insert `AbbrTreeprocessor` and `AbbrBlockprocessor`. """ + treeprocessor = AbbrTreeprocessor(md) + md.treeprocessors.register(treeprocessor, 'abbr', 7) + md.parser.blockprocessors.register(AbbrBlockprocessor(md.parser, treeprocessor.abbrs), 'abbr', 16) + + +class AbbrTreeprocessor(Treeprocessor): + """ Replace abbr text with `` elements. """ + + def __init__(self, md: Markdown | None=None): + self.abbrs = {} + self.RE = None + super().__init__(md) + + def iter_element(self, el, parent=None): + ''' Resursively iterate over elements, run regex on text and wrap matches in `abbr` tags. ''' + for child in reversed(el): + self.iter_element(child, el) + if text := el.text: + for m in reversed(list(self.RE.finditer(text))): + abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]}) + abbr.text = AtomicString(m.group(0)) + abbr.tail = text[m.end():] + el.insert(0, abbr) + text = text[:m.start()] + el.text = text + if parent and el.tail: + tail = el.tail + index = list(parent).index(el) + 1 + for m in reversed(list(self.RE.finditer(tail))): + abbr = etree.Element('abbr', {'title': self.abbrs[m.group(0)]}) + abbr.text = AtomicString(m.group(0)) + abbr.tail = tail[m.end():] + parent.insert(index, abbr) + tail = tail[:m.start()] + el.tail = tail + + def run(self, root: etree.Element) -> etree.Element | None: + ''' Step through tree to find known abbreviations. ''' + if not self.abbrs: + # No abbrs defined. Skip running processor. + return + # Build and compile regex + self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in self.abbrs.keys()) })\\b") + # Step through tree and modify on matches + self.iter_element(root) + return + + +class AbbrBlockprocessor(BlockProcessor): + """ Abbreviation Blockprocessor - parse text for abbr references. """ RE = re.compile(r'^[*]\[(?P[^\\]*?)\][ ]?:[ ]*\n?[ ]*(?P.*)$', re.MULTILINE) + def __init__(self, parser, abbrs): + self.abbrs = abbrs + super().__init__(parser) + def test(self, parent: etree.Element, block: str) -> bool: return True def run(self, parent: etree.Element, blocks: list[str]) -> bool: """ Find and remove all Abbreviation references from the text. - Each reference is set as a new `AbbrPattern` in the markdown instance. + Each reference is added to the abbrs collection. """ block = blocks.pop(0) @@ -57,9 +107,7 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool: if m: abbr = m.group('abbr').strip() title = m.group('title').strip() - self.parser.md.inlinePatterns.register( - AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2 - ) + self.abbrs[abbr] = title if block[m.end():].strip(): # Add any content after match back to blocks as separate block blocks.insert(0, block[m.end():].lstrip('\n')) @@ -71,11 +119,8 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool: blocks.insert(0, block) return False - def _generate_pattern(self, text: str) -> str: - """ Given a string, returns a regex pattern to match that string. """ - return f"(?P<abbr>\\b{ re.escape(text) }\\b)" - +@deprecated("This class will be removed in the future; use `AbbrTreeprocessor` instead.") class AbbrInlineProcessor(InlineProcessor): """ Abbreviation inline pattern. """ diff --git a/tests/test_syntax/extensions/test_abbr.py b/tests/test_syntax/extensions/test_abbr.py index e11e8d30..61021368 100644 --- a/tests/test_syntax/extensions/test_abbr.py +++ b/tests/test_syntax/extensions/test_abbr.py @@ -60,7 +60,7 @@ def test_abbr_lower(self): ) ) - def test_abbr_multiple(self): + def test_abbr_multiple_in_text(self): self.assertMarkdownRenders( self.dedent( """ @@ -79,6 +79,44 @@ def test_abbr_multiple(self): ) ) + def test_abbr_multiple_in_tail(self): + self.assertMarkdownRenders( + self.dedent( + """ + *The* HTML specification + is maintained by the W3C. + + *[HTML]: Hyper Text Markup Language + *[W3C]: World Wide Web Consortium + """ + ), + self.dedent( + """ + <p><em>The</em> <abbr title="Hyper Text Markup Language">HTML</abbr> specification + is maintained by the <abbr title="World Wide Web Consortium">W3C</abbr>.</p> + """ + ) + ) + + def test_abbr_multiple_nested(self): + self.assertMarkdownRenders( + self.dedent( + """ + The *HTML* specification + is maintained by the *W3C*. + + *[HTML]: Hyper Text Markup Language + *[W3C]: World Wide Web Consortium + """ + ), + self.dedent( + """ + <p>The <em><abbr title="Hyper Text Markup Language">HTML</abbr></em> specification + is maintained by the <em><abbr title="World Wide Web Consortium">W3C</abbr></em>.</p> + """ + ) + ) + def test_abbr_override(self): self.assertMarkdownRenders( self.dedent( @@ -325,3 +363,20 @@ def test_abbr_bracket(self): """ ) ) + + def test_abbr_with_attr_list(self): + self.assertMarkdownRenders( + self.dedent( + """ + *[abbr]: Abbreviation Definition + + ![Image with abbr in title](abbr.png){title="Image with abbr in title"} + """ + ), + self.dedent( + """ + <p><img alt="Image with abbr in title" src="abbr.png" title="Image with abbr in title" /></p> + """ + ), + extensions = ['abbr', 'attr_list'] + )