-
Notifications
You must be signed in to change notification settings - Fork 198
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: update content conversion patch
- Ensure tags like <iframe>, <video>, etc. are retained when converting Wiki content to markdown.
- Loading branch information
1 parent
3502304
commit 639a0af
Showing
5 changed files
with
80 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
72 changes: 72 additions & 0 deletions
72
wiki/wiki/doctype/wiki_page/patches/convert_wiki_content_to_markdown.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import re | ||
|
||
import frappe | ||
import six | ||
from bs4 import Comment, Doctype, NavigableString | ||
from markdownify import MarkdownConverter | ||
|
||
# Pattern for HTML heading tag names (h1-h6); it is applied with .match()
# to a node's tag name, so it is anchored at the start of the name.
html_heading_re = re.compile(r"h[1-6]")
|
||
|
||
class CustomMarkdownConverter(MarkdownConverter):
    """Markdown converter that retains embedded-media tags in the output.

    Tags such as <iframe> or <video> are handed to ``process_text`` as a
    whole so that they are retained when converting Wiki content to
    markdown, instead of being reduced to the text of their children.
    """

    # Container tags in which whitespace-only text nodes may be dropped.
    _NESTED_TAGS = frozenset(
        ("ol", "ul", "li", "table", "thead", "tbody", "tfoot", "tr", "td", "th")
    )

    # Tags that are kept in the output rather than converted.
    _RETAINED_TAGS = frozenset(
        ("video", "iframe", "audio", "embed", "object", "source", "picture", "math")
    )

    # override markdownify's process_tag function to retain certain html tags
    def process_tag(self, node, convert_as_inline, children_only=False):
        """Convert ``node`` and its descendants to markdown text.

        Args:
            node: The parsed HTML tag to convert.
            convert_as_inline: Whether ``node`` itself must be rendered inline.
            children_only: When true, only the children of ``node`` are
                converted and no ``convert_<tag>`` handler is applied to
                ``node`` itself.

        Returns:
            The markdown text produced for ``node``.
        """
        text = ""

        # markdown headings or cells can't include
        # block elements (elements w/newlines)
        is_heading = html_heading_re.match(node.name) is not None
        is_cell = node.name in ("td", "th")
        convert_children_as_inline = convert_as_inline
        if not children_only and (is_heading or is_cell):
            convert_children_as_inline = True

        # Remove whitespace-only text nodes in purely nested nodes
        def is_nested_node(el):
            return el and el.name in self._NESTED_TAGS

        if is_nested_node(node):
            for el in node.children:
                # Only extract (remove) a whitespace-only text node if any of
                # the conditions is true:
                # - el is the first element in its parent
                # - el is the last element in its parent
                # - el is adjacent to a nested node
                can_extract = (
                    not el.previous_sibling
                    or not el.next_sibling
                    or is_nested_node(el.previous_sibling)
                    or is_nested_node(el.next_sibling)
                )
                is_blank_text = (
                    isinstance(el, NavigableString)
                    and six.text_type(el).strip() == ""
                )
                if is_blank_text and can_extract:
                    el.extract()

        # Convert the children first
        for el in node.children:
            if isinstance(el, (Comment, Doctype)):
                continue
            if isinstance(el, NavigableString):
                text += self.process_text(el)
            elif el.name in self._RETAINED_TAGS:
                # The whole tag (children included) is emitted here, so it
                # must not be recursed into as well: doing both would output
                # its children a second time (e.g. a <source> inside <video>,
                # or an <img> inside <picture> as an extra markdown image).
                # NOTE(review): process_text may also apply markdown escaping
                # to the tag's markup; confirm attribute values containing
                # "_" or "*" (e.g. URLs) survive unchanged.
                text += self.process_text(el)
            else:
                text += self.process_tag(el, convert_children_as_inline)

        if not children_only:
            convert_fn = getattr(self, f"convert_{node.name}", None)
            if convert_fn and self.should_convert_tag(node.name):
                text = convert_fn(node, text, convert_as_inline)

        return text
|
||
|
||
def custom_markdownify(html, **options):
    """Convert ``html`` to markdown using ``CustomMarkdownConverter``.

    Any keyword ``options`` are passed through to the converter's constructor.
    """
    converter = CustomMarkdownConverter(**options)
    markdown = converter.convert(html)
    return markdown
|
||
|
||
def execute():
    """Convert the stored content of every Wiki Page to markdown.

    Reads the ``name`` and ``content`` of each Wiki Page, runs the content
    through ``custom_markdownify`` and writes the result back to the page's
    ``content`` field. Pages without content are skipped: there is nothing
    to convert, and a missing value is not valid input for the converter.
    """
    wiki_pages = frappe.db.get_all("Wiki Page", fields=["name", "content"])
    for page in wiki_pages:
        content = page["content"]
        if not content:
            # None / empty string: leave the page untouched rather than
            # letting one empty page abort the whole patch.
            continue
        markdown_content = custom_markdownify(content)
        frappe.db.set_value("Wiki Page", page["name"], "content", markdown_content)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,7 +49,7 @@ | |
dependencies: | ||
type-fest "^2.0.0" | ||
|
||
"@tiptap/core@^2.0.0", "@tiptap/core@^2.0.2": | ||
"@tiptap/core@^2.0.2": | ||
version "2.0.2" | ||
resolved "https://registry.npmjs.org/@tiptap/core/-/core-2.0.2.tgz" | ||
integrity sha512-DBry6tpX7mYaTJkEDjVA4WmF8Kgthr275L0uIIOVdwW5nG5PAnOvREKyVOoMQnN3vR7CjtaCK+c3y+MCQhMA/g== | ||
|
@@ -74,7 +74,7 @@ | |
resolved "https://registry.npmjs.org/@tiptap/extension-code-block-lowlight/-/extension-code-block-lowlight-2.0.2.tgz" | ||
integrity sha512-7BbRCKJE2oxsZ5n7HIjS0r/y1S/bSxEJgAFF1Tj3KN2IG3x48w+sqYxRMYmCZdoTexmmBpNF64uYXngKXB9/Ig== | ||
|
||
"@tiptap/extension-code-block@^2.0.0", "@tiptap/extension-code-block@^2.0.2": | ||
"@tiptap/extension-code-block@^2.0.2": | ||
version "2.0.2" | ||
resolved "https://registry.npmjs.org/@tiptap/extension-code-block/-/extension-code-block-2.0.2.tgz" | ||
integrity sha512-GL8ogok1tl1FkXwk0P0ZWYh6oAmSA+R3oubtDZJG1fLlezKLcLYCN/Q2jgYDHDwEOnxMc4JIiT7EYwJ0pqmNaQ== | ||
|
@@ -201,7 +201,7 @@ | |
resolved "https://registry.npmjs.org/@tiptap/extension-text/-/extension-text-2.0.2.tgz" | ||
integrity sha512-kAO+WurWOyHIV/x8qHMF3bSlWrdlPtjEYmf+w8wHKy3FzE55eF6SsGt4FymClNkJmyXdgflXBB3Wv/Z53myy8g== | ||
|
||
"@tiptap/pm@^2.0.0", "@tiptap/pm@^2.0.2": | ||
"@tiptap/pm@^2.0.2": | ||
version "2.0.2" | ||
resolved "https://registry.npmjs.org/@tiptap/pm/-/pm-2.0.2.tgz" | ||
integrity sha512-vXlI82bZ4XrmVD6m/pO27gqlm+tU57mpjy9WjkJpEUOifQZK8LihR3l5k55Z0RqalV4/E79iU1cp8mw0v13nhA== | ||
|
@@ -580,7 +580,7 @@ prosemirror-menu@^1.2.1: | |
prosemirror-history "^1.0.0" | ||
prosemirror-state "^1.0.0" | ||
|
||
prosemirror-model@^1, prosemirror-model@^1.0.0, prosemirror-model@^1.16.0, prosemirror-model@^1.18.1, prosemirror-model@^1.19.0, prosemirror-model@^1.8.1: | ||
prosemirror-model@^1.0.0, prosemirror-model@^1.16.0, prosemirror-model@^1.18.1, prosemirror-model@^1.19.0, prosemirror-model@^1.8.1: | ||
version "1.19.0" | ||
resolved "https://registry.npmjs.org/prosemirror-model/-/prosemirror-model-1.19.0.tgz" | ||
integrity sha512-/CvFGJnwc41EJSfDkQLly1cAJJJmBpZwwUJtwZPTjY2RqZJfM8HVbCreOY/jti8wTRbVyjagcylyGoeJH/g/3w== | ||
|
@@ -603,7 +603,7 @@ prosemirror-schema-list@^1.2.2: | |
prosemirror-state "^1.0.0" | ||
prosemirror-transform "^1.0.0" | ||
|
||
prosemirror-state@^1, prosemirror-state@^1.0.0, prosemirror-state@^1.2.2, prosemirror-state@^1.3.1, prosemirror-state@^1.4.1: | ||
prosemirror-state@^1.0.0, prosemirror-state@^1.2.2, prosemirror-state@^1.3.1, prosemirror-state@^1.4.1: | ||
version "1.4.2" | ||
resolved "https://registry.npmjs.org/prosemirror-state/-/prosemirror-state-1.4.2.tgz" | ||
integrity sha512-puuzLD2mz/oTdfgd8msFbe0A42j5eNudKAAPDB0+QJRw8cO1ygjLmhLrg9RvDpf87Dkd6D4t93qdef00KKNacQ== | ||
|
@@ -640,7 +640,7 @@ prosemirror-transform@^1.0.0, prosemirror-transform@^1.1.0, prosemirror-transfor | |
dependencies: | ||
prosemirror-model "^1.0.0" | ||
|
||
prosemirror-view@^1, prosemirror-view@^1.0.0, prosemirror-view@^1.1.0, prosemirror-view@^1.13.3, prosemirror-view@^1.27.0, prosemirror-view@^1.28.2: | ||
prosemirror-view@^1.0.0, prosemirror-view@^1.1.0, prosemirror-view@^1.13.3, prosemirror-view@^1.27.0, prosemirror-view@^1.28.2: | ||
version "1.30.2" | ||
resolved "https://registry.npmjs.org/prosemirror-view/-/prosemirror-view-1.30.2.tgz" | ||
integrity sha512-nTNzZvalQf9kHeEyO407LiV6DoOs/pXsid88UqW9Vvybo4ozJW2PJhkfZUxCUF1hR/9vJLdhxX84wuw9P9HsXA== | ||
|
@@ -739,7 +739,7 @@ w3c-keyname@^2.2.0: | |
resolved "https://registry.npmjs.org/w3c-keyname/-/w3c-keyname-2.2.6.tgz" | ||
integrity sha512-f+fciywl1SJEniZHD6H+kUO8gOnwIr7f4ijKA6+ZvJFjeGi1r4PDLl53Ayud9O/rk64RqgoQine0feoeOU0kXg== | ||
|
||
which@^1.2.9, which@1.2.x: | ||
which@1.2.x, which@^1.2.9: | ||
version "1.2.14" | ||
resolved "https://registry.npmjs.org/which/-/which-1.2.14.tgz" | ||
integrity sha512-16uPglFkRPzgiUXYMi1Jf8Z5EzN1iB4V0ZtMXcHZnwsBtQhhHeCqoWw7tsUY42hJGNDWtUsVLTjakIa5BgAxCw== | ||
|