Skip to content

Commit

Permalink
Refactor abbr Extension
Browse files Browse the repository at this point in the history
A new `AbbrTreeprocessor` has been introduced, which replaces the now
deprecated `AbbrInlineProcessor`. Abbreviation processing now happens
after Attribute Lists, avoiding a conflict between the two extensions.
Fixes #1460.
  • Loading branch information
waylan committed Apr 24, 2024
1 parent 993b57b commit c03724b
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 16 deletions.
8 changes: 8 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [unreleased]

### Changed

#### Refactor `abbr` Extension

A new `AbbrTreeprocessor` has been introduced, which replaces the now deprecated
`AbbrInlineProcessor`. Abbreviation processing now happens after Attribute Lists,
avoiding a conflict between the two extensions (#1460).

### Fixed

* Fixed links to source code on GitHub from the documentation (#1453).
Expand Down
75 changes: 60 additions & 15 deletions markdown/extensions/abbr.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
from . import Extension
from ..blockprocessors import BlockProcessor
from ..inlinepatterns import InlineProcessor
from ..util import AtomicString
from ..treeprocessors import Treeprocessor
from ..util import AtomicString, deprecated
import re
import xml.etree.ElementTree as etree

Expand All @@ -34,32 +35,79 @@ class AbbrExtension(Extension):
""" Abbreviation Extension for Python-Markdown. """

def extendMarkdown(self, md):
""" Insert `AbbrPreprocessor` before `ReferencePreprocessor`. """
md.parser.blockprocessors.register(AbbrPreprocessor(md.parser), 'abbr', 16)


class AbbrPreprocessor(BlockProcessor):
""" Abbreviation Preprocessor - parse text for abbr references. """
""" Insert `AbbrTreeprocessor` and `AbbrBlockprocessor`. """
treeprocessor = AbbrTreeprocessor(md)
md.treeprocessors.register(treeprocessor, 'abbr', 7)
md.parser.blockprocessors.register(AbbrBlockprocessor(md.parser, treeprocessor.abbrs), 'abbr', 16)


class AbbrTreeprocessor(Treeprocessor):
    """
    Replace abbreviation text with `<abbr>` elements.

    Known abbreviations are held in `self.abbrs`, a mapping of abbreviation
    text to its title. The mapping starts out empty and is expected to be
    filled by other code before `run` is called.
    """

    def __init__(self, md: Markdown | None = None):
        # Mapping of abbreviation text -> title used for the `title` attribute.
        self.abbrs = {}
        # Compiled pattern matching any known abbreviation; built in `run`.
        self.RE = None
        super().__init__(md)

    def _create_abbr(self, text, tail):
        """ Return an `abbr` element wrapping `text`, followed by `tail`. """
        abbr = etree.Element('abbr', {'title': self.abbrs[text]})
        # `AtomicString` marks the abbreviation text as not to be processed again.
        abbr.text = AtomicString(text)
        abbr.tail = tail
        return abbr

    def iter_element(self, el, parent=None):
        """
        Recursively iterate over elements, run regex on text and wrap matches in `abbr` tags.

        `el` is the element to process and `parent` is its parent element,
        or `None` for the root. The tree is modified in place.
        """
        # Children are visited last-to-first: handling a child's tail inserts
        # new siblings *after* that child, which leaves the positions of the
        # children still to be visited untouched.
        for child in reversed(el):
            self.iter_element(child, el)
        text = el.text
        # Text flagged as `AtomicString` must be left exactly as it is.
        if text and not isinstance(text, AtomicString):
            # Matches are handled right-to-left so that slicing `text` never
            # invalidates the offsets of matches still to be handled.
            for m in reversed(list(self.RE.finditer(text))):
                el.insert(0, self._create_abbr(m.group(0), text[m.end():]))
                text = text[:m.start()]
            el.text = text
        # Compare against `None` explicitly: the truth value of an `Element`
        # depends on whether it has children and testing it is deprecated.
        if parent is not None and el.tail:
            tail = el.tail
            index = list(parent).index(el) + 1
            for m in reversed(list(self.RE.finditer(tail))):
                parent.insert(index, self._create_abbr(m.group(0), tail[m.end():]))
                tail = tail[:m.start()]
            el.tail = tail

    def run(self, root: etree.Element) -> etree.Element | None:
        """
        Step through tree to find known abbreviations.

        Always returns `None`; `root` is modified in place.
        """
        if not self.abbrs:
            # No abbrs defined. Skip running processor.
            return None
        # Build and compile regex. Longer abbreviations are tried first so that
        # a short abbreviation cannot shadow a longer one that starts with it.
        keys = sorted(self.abbrs, key=len, reverse=True)
        self.RE = re.compile(f"\\b(?:{ '|'.join(re.escape(key) for key in keys) })\\b")
        # Step through tree and modify on matches
        self.iter_element(root)
        return None


class AbbrBlockprocessor(BlockProcessor):
""" Abbreviation Blockprocessor - parse text for abbr references. """

RE = re.compile(r'^[*]\[(?P<abbr>[^\\]*?)\][ ]?:[ ]*\n?[ ]*(?P<title>.*)$', re.MULTILINE)

def __init__(self, parser, abbrs):
self.abbrs = abbrs
super().__init__(parser)

def test(self, parent: etree.Element, block: str) -> bool:
return True

def run(self, parent: etree.Element, blocks: list[str]) -> bool:
"""
Find and remove all Abbreviation references from the text.
Each reference is set as a new `AbbrPattern` in the markdown instance.
Each reference is added to the abbrs collection.
"""
block = blocks.pop(0)
m = self.RE.search(block)
if m:
abbr = m.group('abbr').strip()
title = m.group('title').strip()
self.parser.md.inlinePatterns.register(
AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2
)
self.abbrs[abbr] = title
if block[m.end():].strip():
# Add any content after match back to blocks as separate block
blocks.insert(0, block[m.end():].lstrip('\n'))
Expand All @@ -71,11 +119,8 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
blocks.insert(0, block)
return False

def _generate_pattern(self, text: str) -> str:
""" Given a string, returns a regex pattern to match that string. """
return f"(?P<abbr>\\b{ re.escape(text) }\\b)"


@deprecated("This class will be removed in the future; use `AbbrTreeprocessor` instead.")
class AbbrInlineProcessor(InlineProcessor):
""" Abbreviation inline pattern. """

Expand Down
57 changes: 56 additions & 1 deletion tests/test_syntax/extensions/test_abbr.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def test_abbr_lower(self):
)
)

def test_abbr_multiple(self):
def test_abbr_multiple_in_text(self):
self.assertMarkdownRenders(
self.dedent(
"""
Expand All @@ -79,6 +79,44 @@ def test_abbr_multiple(self):
)
)

def test_abbr_multiple_in_tail(self):
    # Both abbreviations sit in text that follows the inline `<em>` element;
    # each one is expected to be wrapped in its own `<abbr>` tag.
    source = self.dedent(
        """
        *The* HTML specification
        is maintained by the W3C.
        *[HTML]: Hyper Text Markup Language
        *[W3C]: World Wide Web Consortium
        """
    )
    expected = self.dedent(
        """
        <p><em>The</em> <abbr title="Hyper Text Markup Language">HTML</abbr> specification
        is maintained by the <abbr title="World Wide Web Consortium">W3C</abbr>.</p>
        """
    )
    self.assertMarkdownRenders(source, expected)

def test_abbr_multiple_nested(self):
    # Each abbreviation is the text of an `<em>` element; the `<abbr>` tag is
    # expected to end up nested inside that `<em>`.
    source = self.dedent(
        """
        The *HTML* specification
        is maintained by the *W3C*.
        *[HTML]: Hyper Text Markup Language
        *[W3C]: World Wide Web Consortium
        """
    )
    expected = self.dedent(
        """
        <p>The <em><abbr title="Hyper Text Markup Language">HTML</abbr></em> specification
        is maintained by the <em><abbr title="World Wide Web Consortium">W3C</abbr></em>.</p>
        """
    )
    self.assertMarkdownRenders(source, expected)

def test_abbr_override(self):
self.assertMarkdownRenders(
self.dedent(
Expand Down Expand Up @@ -325,3 +363,20 @@ def test_abbr_bracket(self):
"""
)
)

def test_abbr_with_attr_list(self):
    # With `attr_list` enabled alongside `abbr`, the defined abbreviation also
    # appears in the image's `alt` and `title`; the expected output keeps both
    # attributes as plain text.
    source = self.dedent(
        """
        *[abbr]: Abbreviation Definition
        ![Image with abbr in title](abbr.png){title="Image with abbr in title"}
        """
    )
    expected = self.dedent(
        """
        <p><img alt="Image with abbr in title" src="abbr.png" title="Image with abbr in title" /></p>
        """
    )
    self.assertMarkdownRenders(source, expected, extensions=['abbr', 'attr_list'])

0 comments on commit c03724b

Please sign in to comment.