Skip to content

Commit d241477

Browse files
committed
Adjust md_in_html "markdown" blocks to process content consistently
Ensure `md_in_html` processes content inside a "markdown" block the same way content is processed outside of a "markdown" block. - Flatten the HTML content into placeholders so that the parser will treat the "markdown" block content in the same way it does when `md_in_html` is not enabled. The placeholders are expanded once the parser reaches them in a linear fashion. This allows extensions to deal with HTML content and consume it the same way they do when the content is not nested under a "markdown" block. - Instead of content being processed in dummy tags, content is now processed under the real parent, allowing extensions to have better context to make better decisions.
1 parent 4260e7b commit d241477

File tree

1 file changed

+73
-43
lines changed

1 file changed

+73
-43
lines changed

markdown/extensions/md_in_html.py

Lines changed: 73 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,36 @@ def handle_endtag(self, tag):
163163
# If we only have one newline before block element, add another
164164
if not item.endswith('\n\n') and item.endswith('\n'):
165165
self.cleandoc.append('\n')
166+
167+
# Flatten the HTML structure of "markdown" blocks such that when they
168+
# get parsed, content will be parsed similar inside the blocks as it
169+
# does outside the block. Having real HTML elements in the tree before
170+
# the adjacent content is processed can cause unpredictable
171+
# issues for extensions.
172+
current = element
173+
last = []
174+
while current is not None:
175+
for child in list(current):
176+
current.remove(child)
177+
text = current.text if current.text is not None else ''
178+
tail = child.tail if child.tail is not None else ''
179+
child.tail = None
180+
state = child.attrib.get('markdown', 'off')
181+
182+
# If the tail is just a new line, omit it.
183+
if tail == '\n':
184+
tail = ''
185+
186+
# Process the block nested under the span appropriately
187+
if state in ('span', 'block'):
188+
current.text = text + '\n' + self.md.htmlStash.store(child) + '\n' + tail
189+
last.append(child)
190+
else:
191+
child.attrib.pop('markdown')
192+
[c.attrib.pop('markdown', None) for c in child.iter()]
193+
current.text = text + '\n' + self.md.htmlStash.store(child) + '\n' + tail
194+
current = last.pop(0) if last else None
195+
166196
self.cleandoc.append(self.md.htmlStash.store(element))
167197
self.cleandoc.append('\n\n')
168198
self.state = []
@@ -270,53 +300,53 @@ def parse_element_content(self, element: etree.Element) -> None:
270300
md_attr = element.attrib.pop('markdown', 'off')
271301

272302
if md_attr == 'block':
273-
# Parse content as block level
274-
# The order in which the different parts are parsed (text, children, tails) is important here as the
275-
# order of elements needs to be preserved. We can't be inserting items at a later point in the current
276-
# iteration as we don't want to do raw processing on elements created from parsing Markdown text (for
277-
# example). Therefore, the order of operations is children, tails, text.
278-
279-
# Recursively parse existing children from raw HTML
280-
for child in list(element):
281-
self.parse_element_content(child)
282-
283-
# Parse Markdown text in tail of children. Do this separate to avoid raw HTML parsing.
284-
# Save the position of each item to be inserted later in reverse.
285-
tails = []
286-
for pos, child in enumerate(element):
287-
if child.tail:
288-
block = child.tail.rstrip('\n')
289-
child.tail = ''
290-
# Use a dummy placeholder element.
291-
dummy = etree.Element('div')
292-
self.parser.parseBlocks(dummy, block.split('\n\n'))
293-
children = list(dummy)
294-
children.reverse()
295-
tails.append((pos + 1, children))
296-
297-
# Insert the elements created from the tails in reverse.
298-
tails.reverse()
299-
for pos, tail in tails:
300-
for item in tail:
301-
element.insert(pos, item)
302-
303-
# Parse Markdown text content. Do this last to avoid raw HTML parsing.
303+
# Parse the block elements content as Markdown
304304
if element.text:
305305
block = element.text.rstrip('\n')
306306
element.text = ''
307-
# Use a dummy placeholder element as the content needs to get inserted before existing children.
308-
dummy = etree.Element('div')
309-
self.parser.parseBlocks(dummy, block.split('\n\n'))
310-
children = list(dummy)
311-
children.reverse()
312-
for child in children:
313-
element.insert(0, child)
307+
self.parser.parseBlocks(element, block.split('\n\n'))
314308

315309
elif md_attr == 'span':
316-
# Span level parsing will be handled by inline processors.
317-
# Walk children here to remove any `markdown` attributes.
318-
for child in list(element):
319-
self.parse_element_content(child)
310+
# Span elements need to be recursively processed for block elements and raw HTML
311+
# as their content is not normally accessed by block processors, so expand stashed
312+
# HTML under the span. Span content itself will not be parsed here, but will await
313+
# the inline parser.
314+
block = element.text
315+
element.text = ''
316+
child = None
317+
start = 0
318+
319+
# Search the content for HTML placeholders and process the elements
320+
for m in util.HTML_PLACEHOLDER_RE.finditer(block):
321+
index = int(m.group(1))
322+
el = self.parser.md.htmlStash.rawHtmlBlocks[index]
323+
end = m.start()
324+
325+
# Cut out the placeholder and insert the processed element back in.
326+
if isinstance(el, etree.Element):
327+
if child is None:
328+
element.text = block[start:end]
329+
else:
330+
child.tail = (child.tail if child.tail is not None else '') + block[start:end]
331+
element.append(el)
332+
self.parse_element_content(el)
333+
child = el
334+
self.parser.md.htmlStash.rawHtmlBlocks.pop(index)
335+
self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '')
336+
337+
else:
338+
# Not an element object, so insert content back into the element
339+
if child is None:
340+
element.text = block[start:end]
341+
else:
342+
child.tail = (child.tail if child.tail is not None else '')+ block[start:end]
343+
start = end
344+
345+
# Insert anything left after last element
346+
if child is None:
347+
element.text = block[start:]
348+
else:
349+
child.tail = (child.tail if child.tail is not None else '') + block[start:]
320350

321351
else:
322352
# Disable inline parsing for everything else
@@ -336,8 +366,8 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
336366
if isinstance(element, etree.Element):
337367
# We have a matched element. Process it.
338368
blocks.pop(0)
339-
self.parse_element_content(element)
340369
parent.append(element)
370+
self.parse_element_content(element)
341371
# Cleanup stash. Replace element with empty string to avoid confusing postprocessor.
342372
self.parser.md.htmlStash.rawHtmlBlocks.pop(index)
343373
self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '')

0 commit comments

Comments
 (0)