More thorough cleanup of input whitespace

This improves the markdownify logic for cleaning up input whitespace that has no semantic significance in HTML. This PR uses a branch based on that for matthewwithanm#150 (which in turn is based on that for matthewwithanm#120) to avoid conflicts with those fixes. The suggested order of merging is just first to merge matthewwithanm#120, then the rest of matthewwithanm#150, then the rest of this PR. Whitespace in HTML input isn't generally significant before or after block-level elements, or at the start or end of such an element other than `<pre>`. There is some limited logic in markdownify for removing it: (a) for whitespace-only nodes in conjunction with a limited list of elements (and with questionable logic that only removes whitespace adjacent to such an element when also inside such an element) and (b) only for trailing whitespace, in certain places in relation to lists. Replace both those places with more thorough logic using a common list of block-level elements (which could be expanded more). In general, this reduces the number of unnecessary blank lines in output from markdownify (sometimes lines with just a newline, sometimes lines containing a space as well as that newline). There are open issues about cases where propagating such input whitespace to the output actually results in badly formed Markdown output (wrongly indented output), but matthewwithanm#120 (which this builds on) fixes those issues, sometimes leaving unnecessary lines with just a space on them in the output, which are dealt with fully by the present PR. There are a few test cases that are affected because they were relying on such whitespace for good output from bad HTML input that used `<p>` or `<blockquote>` inside header tags.
To keep reasonable output in those cases of bad input now that input whitespace adjacent to those two tags is ignored, make the `<p>` and `<blockquote>` output explicitly include leading and trailing spaces if `convert_as_inline` is set; such explicit spaces seem the best that can be done for such bad input. Given those fixes, all the remaining changes needed to the expectations of existing tests seem like improvements (removing useless spaces or newlines from the output).
jsm28 · Oct 3, 2024 · 340aecb · 340aecb
1 parent c2ffe46
commit 340aecb
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 32 deletions.
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
@@ -67,6 +67,23 @@ def _todict(obj):
     return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_'))
 
 
+def remove_whitespace_inside(el):
+    """Return whether to remove whitespace immediately inside a block-level element."""
+    if not el or not el.name:
+        return False
+    if html_heading_re.match(el.name) is not None:
+        return True
+    return el.name in ('p', 'blockquote',
+                       'ol', 'ul', 'li',
+                       'table', 'thead', 'tbody', 'tfoot',
+                       'tr', 'td', 'th')
+
+
+def remove_whitespace_outside(el):
+    """Return whether to remove whitespace immediately outside a block-level element."""
+    return remove_whitespace_inside(el) or (el and el.name == 'pre')
+
+
 class MarkdownConverter(object):
     class DefaultOptions:
         autolinks = True
@@ -120,27 +137,23 @@ def process_tag(self, node, convert_as_inline, children_only=False):
         if not children_only and (isHeading or isCell):
             convert_children_as_inline = True
 
-        # Remove whitespace-only textnodes in purely nested nodes
-        def is_nested_node(el):
-            return el and el.name in ['ol', 'ul', 'li',
-                                      'table', 'thead', 'tbody', 'tfoot',
-                                      'tr', 'td', 'th']
-
-        if is_nested_node(node):
-            for el in node.children:
-                # Only extract (remove) whitespace-only text node if any of the
-                # conditions is true:
-                # - el is the first element in its parent
-                # - el is the last element in its parent
-                # - el is adjacent to an nested node
-                can_extract = (not el.previous_sibling
-                               or not el.next_sibling
-                               or is_nested_node(el.previous_sibling)
-                               or is_nested_node(el.next_sibling))
-                if (isinstance(el, NavigableString)
-                        and six.text_type(el).strip() == ''
-                        and can_extract):
-                    el.extract()
+        # Remove whitespace-only textnodes just before, after or
+        # inside block-level elements.
+        remove_inside = remove_whitespace_inside(node)
+        for el in node.children:
+            # Only extract (remove) whitespace-only text node if any of the
+            # conditions is true:
+            # - el is the first element in its parent (block-level)
+            # - el is the last element in its parent (block-level)
+            # - el is adjacent to a block-level node
+            can_extract = (remove_inside and (not el.previous_sibling
+                                              or not el.next_sibling)
+                           or remove_whitespace_outside(el.previous_sibling)
+                           or remove_whitespace_outside(el.next_sibling))
+            if (isinstance(el, NavigableString)
+                    and six.text_type(el).strip() == ''
+                    and can_extract):
+                el.extract()
 
         # Convert the children first
         for el in node.children:
@@ -179,12 +192,16 @@ def process_text(self, el):
         if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
             text = self.escape(text)
 
-        # remove trailing whitespaces if any of the following condition is true:
-        # - current text node is the last node in li
-        # - current text node is followed by an embedded list
-        if (el.parent.name == 'li'
-                and (not el.next_sibling
-                     or el.next_sibling.name in ['ul', 'ol'])):
+        # remove leading whitespace at the start or just after a
+        # block-level element; remove trailing whitespace at the end
+        # or just before a block-level element.
+        if (remove_whitespace_outside(el.previous_sibling)
+                or (remove_whitespace_inside(el.parent)
+                    and not el.previous_sibling)):
+            text = text.lstrip()
+        if (remove_whitespace_outside(el.next_sibling)
+                or (remove_whitespace_inside(el.parent)
+                    and not el.next_sibling)):
             text = text.rstrip()
 
         return text
@@ -257,7 +274,7 @@ def convert_a(self, el, text, convert_as_inline):
     def convert_blockquote(self, el, text, convert_as_inline):
 
         if convert_as_inline:
-            return text
+            return ' ' + text.strip() + ' '
 
         return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else ''
 
@@ -355,7 +372,7 @@ def convert_li(self, el, text, convert_as_inline):
 
     def convert_p(self, el, text, convert_as_inline):
         if convert_as_inline:
-            return text
+            return ' ' + text.strip() + ' '
         if self.options['wrap']:
             # Preserve newlines (and preceding whitespace) resulting
             # from <br> tags.  Newlines in the input have already been

diff --git a/tests/test_conversions.py b/tests/test_conversions.py
@@ -66,7 +66,7 @@ def test_blockquote_with_paragraph():
 
 def test_blockquote_nested():
     text = md('<blockquote>And she was like <blockquote>Hello</blockquote></blockquote>')
-    assert text == '\n> And she was like \n> > Hello\n\n'
+    assert text == '\n> And she was like\n> > Hello\n\n'
 
 
 def test_br():
@@ -136,7 +136,7 @@ def test_hn():
 
 
 def test_hn_chained():
-    assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '\n# First\n\n\n## Second\n\n\n### Third\n\n'
+    assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '\n# First\n\n## Second\n\n### Third\n\n'
     assert md('X<h1>First</h1>', heading_style=ATX) == 'X\n# First\n\n'
     assert md('X<h1>First</h1>', heading_style=ATX_CLOSED) == 'X\n# First #\n\n'
     assert md('X<h1>First</h1>') == 'X\n\nFirst\n=====\n\n'
@@ -196,7 +196,7 @@ def test_head():
 def test_hr():
     assert md('Hello<hr>World') == 'Hello\n\n---\n\nWorld'
     assert md('Hello<hr />World') == 'Hello\n\n---\n\nWorld'
-    assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == '\n\nHello\n\n\n---\n\n\nWorld\n\n'
+    assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == '\n\nHello\n\n---\n\nWorld\n\n'
 
 
 def test_i():
@@ -303,3 +303,13 @@ def callback(el):
     assert md('<pre class="python">test\n    foo\nbar</pre>', code_language_callback=callback) == '\n```python\ntest\n    foo\nbar\n```\n'
     assert md('<pre class="javascript"><code>test\n    foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n    foo\nbar\n```\n'
     assert md('<pre class="javascript"><code class="javascript">test\n    foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n    foo\nbar\n```\n'
+
+
+def test_spaces():
+    assert md('<p> a b </p> <p> c d </p>') == '\n\na b\n\nc d\n\n'
+    assert md('<p> <i>a</i> </p>') == '\n\n*a*\n\n'
+    assert md('test <p> again </p>') == 'test\n\nagain\n\n'
+    assert md('test <blockquote> text </blockquote> after') == 'test\n> text\n\nafter'
+    assert md(' <ol> <li> x </li> <li> y </li> </ol> ') == '\n\n1. x\n2. y\n'
+    assert md(' <ul> <li> x </li> <li> y </li> </ol> ') == '\n\n* x\n* y\n'
+    assert md('test <pre> foo </pre> bar') == 'test\n```\n foo \n```\nbar'