
div mis-converted #107

Open

mirabilos opened this issue Jan 29, 2024 · 3 comments
@mirabilos

>>> MarkdownConverter().convert('<div>foo</div><div>bar<span>baz</span><span>meow</span></div>')
'foobarbazmeow'

Expected: 'foo  \nbarbazmeow'

@mirabilos
Author

Looking at the code, this is probably hard to fix: when converting the second div, the converter would have to look behind and see that the text emitted so far does not already end with a \n\n hard paragraph break.
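To illustrate the lookbehind that would be needed, here is a minimal stdlib sketch (illustrative only, not markdownify's internal API): before appending a block element's output, check whether the text emitted so far already ends with a paragraph break, and insert a hard line break if not.

```python
# Sketch of the missing "lookbehind": before appending a block's text,
# check whether the output so far already ends with a paragraph break.
# This is illustrative only, not markdownify's actual converter API.
def append_block(out: str, block_text: str) -> str:
    if out and not out.endswith('\n\n'):
        out += '  \n'  # Markdown hard line break (two trailing spaces)
    return out + block_text

result = append_block('foo', 'barbazmeow')
# result == 'foo  \nbarbazmeow'
```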

@mirabilos
Author

mirabilos commented Jan 29, 2024

Workaround (this relies on the fix for #92 being applied):

>>> text = '<div>foo</div><div>bar<span>baz</span><span>meow</span></div>'
>>> html = bs4.BeautifulSoup(text, 'html.parser')
>>> for e in html.find_all('div'):
...     e.insert_before(html.new_tag('br'))
... 
>>> text = MarkdownConverter().convert_soup(html)
>>> text = re.sub('  \n  \n', '\n\n', text)
>>> text = re.sub(' *\n\n+', '\n\n', text).strip()
>>> text
'foo  \nbarbazmeow'

Maybe it helps someone.
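The regex cleanup in the workaround can be exercised in isolation. Assuming convert_soup yields something like '  \nfoo  \nbarbazmeow' after the <br> insertion (a hypothetical intermediate; the exact shape depends on the markdownify version):

```python
import re

# Hypothetical intermediate: each inserted <br> became a '  \n' hard break.
text = '  \nfoo  \nbarbazmeow'
text = re.sub('  \n  \n', '\n\n', text)         # collapse doubled hard breaks
text = re.sub(' *\n\n+', '\n\n', text).strip()  # normalize paragraph breaks
# text == 'foo  \nbarbazmeow'
```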

For completeness, here is my current, complete example of how I clean up HTML from RSS feeds before posting it, as Markdown, to the Fediverse (called as cleanup(post.x), where x is title, summary, content, …). It includes a number of workarounds for bad input and for limits of the conversion tools:

import re

import bs4
import bs4.builder
from markdownify import MarkdownConverter

# Remove one whitespace-only text node from tag's children;
# returns True if one was removed (callers loop until False).
def _cleanup_tablish(tag):
    for e in tag.contents:
        if isinstance(e, bs4.element.NavigableString) and str(e).strip() == '':
            e.extract()
            return True
    return False

# Flatten a table that wraps a single cell (possibly through nested
# table/tbody/tr/th/td levels) into a plain <div>.
def _cleanup_table(top):
    tag = top
    while isinstance(tag, bs4.element.Tag) and \
      tag.name in ('table', 'tbody', 'tr', 'th', 'td'):
        while _cleanup_tablish(tag):
            pass
        have_tablish = False
        have_nontablish = False
        have_elts = 0
        for e in tag.contents:
            if isinstance(e, bs4.element.NavigableString):
                have_nontablish = True
            elif e.name in ('table', 'tbody', 'tr', 'th', 'td'):
                have_tablish = True
            else:
                have_nontablish = True
            have_elts = have_elts + 1
        if have_elts == 0:
            top.extract()
            return
        if have_nontablish:
            if have_tablish:
                # huh?
                return
            tag.name = 'div'
            tag.attrs.clear()
            e = tag.contents[0]
            if have_elts == 1 and isinstance(e, bs4.element.Tag) and \
              e.name in bs4.builder.HTMLTreeBuilder.block_elements:
                tag = e
            if tag != top:
                top.replace_with(tag)
            return
        if have_elts > 1:
            return
        tag = tag.contents[0]

_cleanup_traildots = re.compile(r'\.\.\.$')
def cleanup(text):
    text = re.sub('\r+\n?', '\n', text)
    html = bs4.BeautifulSoup(text, 'html.parser', multi_valued_attributes=None)
    # remove <!-- comments -->
    for e in html.find_all(string=lambda e: isinstance(e, bs4.element.Comment)):
        e.extract()
    # flatten tables with only one cell (Goodreads)
    for e in html.find_all('table'):
        _cleanup_table(e)
    # expand shortened links
    for e in html.find_all('a', href=True, string=_cleanup_traildots):
        href = str(e['href'])
        if href.startswith(str(e.string).rstrip('.')):
            e.string.replace_with(href)
    # temporarily move <pre>s aside
    pres = []
    npres = 0
    for pre in html.find_all('pre'):
        pres.append(pre.replace_with(html.new_tag('rpre', num=npres)))
        npres = npres + 1
    # clean whitespace except in the extracted <pre>s
    text = str(html)
    text = re.sub(' *\n *', '\n', text)
    # use \x01 as a newline placeholder: runs of 3+ newlines become a
    # paragraph break, remaining newlines become a single space
    text = text.replace('\n', '\x01')
    text = re.sub('\x01\x01\x01+', '\n\n', text)
    text = re.sub('\x01+ *', ' ', text).strip()
    text = re.sub('[\t ]+', ' ', text)
    # bring back the extracted <pre>s
    html = bs4.BeautifulSoup(text, 'html.parser')
    for pre in html.find_all('rpre'):
        pre.replace_with(pres[int(pre.attrs['num'])])
    # work around https://github.com/matthewwithanm/python-markdownify/issues/107
    for e in html.find_all('div'):
        e.insert_before(html.new_tag('br'))
    # convert and clean up
    text = MarkdownConverter(strip=['img']).convert_soup(html)
    text = re.sub('  \n  \n', '\n\n', '\n' + text + '\n')
    text = re.sub('(\n> )+\n', '\n> \n', '\n' + text + '\n')
    text = re.sub(' *\n\n+', '\n\n', text)
    return text.strip()
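The whitespace-normalization pass inside cleanup() (the newline-placeholder trick) can be illustrated on plain text in isolation; a minimal sketch of the same substitutions:

```python
import re

# Same substitutions as in cleanup(): \x01 stands in for newline so that
# runs of 3+ newlines become paragraph breaks and the rest become spaces.
text = 'a \n \n\n  b\nc'
text = re.sub(' *\n *', '\n', text)           # trim spaces around newlines
text = text.replace('\n', '\x01')             # park newlines in a placeholder
text = re.sub('\x01\x01\x01+', '\n\n', text)  # 3+ newlines -> paragraph break
text = re.sub('\x01+ *', ' ', text).strip()   # remaining newlines -> space
text = re.sub('[\t ]+', ' ', text)            # collapse runs of blanks
# text == 'a\n\nb c'
```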

@chrispy-snps
Collaborator

We're hitting this too. It is a difficult fix in the current architecture.
