diff --git a/markdownify/__init__.py b/markdownify/__init__.py index a37f870..dd2507d 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -67,6 +67,23 @@ def _todict(obj): return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_')) +def remove_whitespace_inside(el): + """Return to remove whitespace immediately inside a block-level element.""" + if not el or not el.name: + return False + if html_heading_re.match(el.name) is not None: + return True + return el.name in ('p', 'blockquote', + 'ol', 'ul', 'li', + 'table', 'thead', 'tbody', 'tfoot', + 'tr', 'td', 'th') + + +def remove_whitespace_outside(el): + """Return to remove whitespace immediately outside a block-level element.""" + return remove_whitespace_inside(el) or (el and el.name == 'pre') + + class MarkdownConverter(object): class DefaultOptions: autolinks = True @@ -120,27 +137,23 @@ def process_tag(self, node, convert_as_inline, children_only=False): if not children_only and (isHeading or isCell): convert_children_as_inline = True - # Remove whitespace-only textnodes in purely nested nodes - def is_nested_node(el): - return el and el.name in ['ol', 'ul', 'li', - 'table', 'thead', 'tbody', 'tfoot', - 'tr', 'td', 'th'] - - if is_nested_node(node): - for el in node.children: - # Only extract (remove) whitespace-only text node if any of the - # conditions is true: - # - el is the first element in its parent - # - el is the last element in its parent - # - el is adjacent to an nested node - can_extract = (not el.previous_sibling - or not el.next_sibling - or is_nested_node(el.previous_sibling) - or is_nested_node(el.next_sibling)) - if (isinstance(el, NavigableString) - and six.text_type(el).strip() == '' - and can_extract): - el.extract() + # Remove whitespace-only textnodes just before, after or + # inside block-level elements. + remove_inside = remove_whitespace_inside(node) + for el in node.children: + # Only extract (remove) whitespace-only text node if any of the + # conditions is true: + # - el is the first element in its parent (block-level) + # - el is the last element in its parent (block-level) + # - el is adjacent to a block-level node + can_extract = (remove_inside and (not el.previous_sibling + or not el.next_sibling) + or remove_whitespace_outside(el.previous_sibling) + or remove_whitespace_outside(el.next_sibling)) + if (isinstance(el, NavigableString) + and six.text_type(el).strip() == '' + and can_extract): + el.extract() # Convert the children first for el in node.children: @@ -179,12 +192,16 @@ def process_text(self, el): if not el.find_parent(['pre', 'code', 'kbd', 'samp']): text = self.escape(text) - # remove trailing whitespaces if any of the following condition is true: - # - current text node is the last node in li - # - current text node is followed by an embedded list - if (el.parent.name == 'li' - and (not el.next_sibling - or el.next_sibling.name in ['ul', 'ol'])): + # remove leading whitespace at the start or just after a + # block-level element; remove traliing whitespace at the end + # or just before a block-level element. + if (remove_whitespace_outside(el.previous_sibling) + or (remove_whitespace_inside(el.parent) + and not el.previous_sibling)): + text = text.lstrip() + if (remove_whitespace_outside(el.next_sibling) + or (remove_whitespace_inside(el.parent) + and not el.next_sibling)): text = text.rstrip() return text @@ -257,7 +274,7 @@ def convert_a(self, el, text, convert_as_inline): def convert_blockquote(self, el, text, convert_as_inline): if convert_as_inline: - return text + return ' ' + text.strip() + ' ' return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else '' @@ -355,7 +372,7 @@ def convert_li(self, el, text, convert_as_inline): def convert_p(self, el, text, convert_as_inline): if convert_as_inline: - return text + return ' ' + text.strip() + ' ' if self.options['wrap']: # Preserve newlines (and preceding whitespace) resulting # from
tags. Newlines in the input have already been diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 9c1edc3..0be1d0c 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -66,7 +66,7 @@ def test_blockquote_with_paragraph(): def test_blockquote_nested(): text = md('
And she was like
Hello
') - assert text == '\n> And she was like \n> > Hello\n\n' + assert text == '\n> And she was like\n> > Hello\n\n' def test_br(): @@ -136,7 +136,7 @@ def test_hn(): def test_hn_chained(): - assert md('

First

\n

Second

\n

Third

', heading_style=ATX) == '\n# First\n\n\n## Second\n\n\n### Third\n\n' + assert md('

First

\n

Second

\n

Third

', heading_style=ATX) == '\n# First\n\n## Second\n\n### Third\n\n' assert md('X

First

', heading_style=ATX) == 'X\n# First\n\n' assert md('X

First

', heading_style=ATX_CLOSED) == 'X\n# First #\n\n' assert md('X

First

') == 'X\n\nFirst\n=====\n\n' @@ -196,7 +196,7 @@ def test_head(): def test_hr(): assert md('Hello
World') == 'Hello\n\n---\n\nWorld' assert md('Hello
World') == 'Hello\n\n---\n\nWorld' - assert md('

Hello

\n
\n

World

') == '\n\nHello\n\n\n---\n\n\nWorld\n\n' + assert md('

Hello

\n
\n

World

') == '\n\nHello\n\n---\n\nWorld\n\n' def test_i(): @@ -303,3 +303,13 @@ def callback(el): assert md('
test\n    foo\nbar
', code_language_callback=callback) == '\n```python\ntest\n foo\nbar\n```\n' assert md('
test\n    foo\nbar
', code_language_callback=callback) == '\n```javascript\ntest\n foo\nbar\n```\n' assert md('
test\n    foo\nbar
', code_language_callback=callback) == '\n```javascript\ntest\n foo\nbar\n```\n' + + +def test_spaces(): + assert md('

a b

c d

') == '\n\na b\n\nc d\n\n' + assert md('

a

') == '\n\n*a*\n\n' + assert md('test

again

') == 'test\n\nagain\n\n' + assert md('test
text
after') == 'test\n> text\n\nafter' + assert md('
  1. x
  2. y
') == '\n\n1. x\n2. y\n' + assert md('