Skip to content

Extract rich text from blocks #8

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Mar 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
__pycache__
build/
*.docx
*.pptx
*.pptx
scratch/
1 change: 1 addition & 0 deletions jsondoc/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

127 changes: 79 additions & 48 deletions jsondoc/convert/html.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
from types import NoneType
from typing import List, Union
from typing import Callable, List, Union

from bs4 import BeautifulSoup, Comment, Doctype, NavigableString
from pydantic import BaseModel
Expand Down Expand Up @@ -45,7 +45,7 @@
from jsondoc.models.page import Page
from jsondoc.models.shared_definitions import Annotations
from jsondoc.rules import is_block_child_allowed
from jsondoc.utils import generate_id, get_current_time
from jsondoc.utils import generate_block_id, get_current_time

line_beginning_re = re.compile(r"^", re.MULTILINE)
whitespace_re = re.compile(r"[\t ]+")
Expand Down Expand Up @@ -307,7 +307,9 @@ def reconcile_to_rich_text(


def reconcile_to_block(
block: BlockBase, children: List[CHILDREN_TYPE]
block: BlockBase,
children: List[CHILDREN_TYPE],
typeid: bool = False,
) -> List[CHILDREN_TYPE]:
"""
Given a block and a list of children,
Expand Down Expand Up @@ -350,7 +352,7 @@ def reconcile_to_block(
# Get corresponding field from the block
block_field = getattr(block, block_type)
init_kwargs = {
"id": generate_id(),
"id": generate_block_id(typeid=typeid),
"created_time": child.created_time,
block_type: type(block_field)(),
}
Expand Down Expand Up @@ -383,26 +385,20 @@ def reconcile_to_block(


class HtmlToJsonDocConverter(object):
class DefaultOptions:
autolinks = True
code_language = ""
code_language_callback = None
convert = None
default_title = False
keep_inline_images_in = []
strip = None
force_page = False

class Options(DefaultOptions):
pass
class Options(BaseModel):
    """Validated keyword options for the HTML converter.

    ``strip`` and ``convert`` are mutually exclusive; the converter raises
    ``ValueError`` when both are set.
    """

    # NOTE(review): not read anywhere in the visible code.
    autolinks: bool = True
    # Language given to code blocks when the callback is unset or returns
    # a falsy value.
    code_language: str = ""
    # Called with the <pre> element; a truthy result overrides
    # ``code_language``.
    code_language_callback: Callable | None = None
    # Tag names to convert. The error raised for strip+convert calls these
    # "tags", so a list of names is accepted; ``Callable`` stays accepted
    # only for compatibility with the previous annotation.
    # NOTE(review): presumably an allow-list tested with ``tag in convert``
    # -- confirm against should_convert_tag.
    convert: list[str] | Callable | None = None
    # NOTE(review): not read anywhere in the visible code.
    default_title: bool = False
    # Parent tag names under which an inline <img> is still emitted as an
    # image block instead of being reduced to its alt text.
    keep_inline_images_in: list[str] = []
    # Tag names that must not be converted. Membership is tested with
    # ``tag not in strip``, so pass a list of names: a bare string is still
    # accepted for compatibility but is matched by substring.
    strip: list[str] | str | None = None
    # Build a Page even when the parsed HTML is not detected as a page.
    force_page: bool = False
    # Forwarded as ``typeid=`` to the page/block factories and to
    # ``generate_block_id``.
    typeid: bool = False

def __init__(self, **options):
# Create an options dictionary. Use DefaultOptions as a base so that
# it doesn't have to be extended.
self.options = _todict(self.DefaultOptions)
self.options.update(_todict(self.Options))
self.options.update(options)
if self.options["strip"] is not None and self.options["convert"] is not None:
self.options = self.Options(**options)
if self.options.strip is not None and self.options.convert is not None:
raise ValueError(
"You may specify either tags to strip or tags to convert, but not both."
)
Expand All @@ -417,7 +413,7 @@ def convert_soup(self, soup: BeautifulSoup) -> Page | BlockBase | List[BlockBase
is_page = self._is_soup_page(soup)

ret = None
if is_page or self.options["force_page"]:
if is_page or self.options.force_page:
title = self._get_html_title(soup)
# Ensure that children is a list
if not isinstance(children, list):
Expand All @@ -427,6 +423,7 @@ def convert_soup(self, soup: BeautifulSoup) -> Page | BlockBase | List[BlockBase
ret = create_page(
title=title,
children=children,
typeid=self.options.typeid,
)
else:
ret = children
Expand Down Expand Up @@ -526,7 +523,11 @@ def is_nested_node(el):
if current_level_object is None:
objects = children_objects
elif isinstance(current_level_object, BlockBase):
objects = reconcile_to_block(current_level_object, children_objects)
objects = reconcile_to_block(
current_level_object,
children_objects,
typeid=self.options.typeid,
)
elif isinstance(current_level_object, RichTextBase):
objects = reconcile_to_rich_text(current_level_object, children_objects)
else:
Expand Down Expand Up @@ -615,8 +616,8 @@ def process_text(self, el):

def should_convert_tag(self, tag):
tag = tag.lower()
strip = self.options["strip"]
convert = self.options["convert"]
strip = self.options.strip
convert = self.options.convert
if strip is not None:
return tag not in strip
elif convert is not None:
Expand All @@ -629,7 +630,7 @@ def convert_a(self, el, convert_as_inline):
return ConvertOutput(main_object=create_rich_text(url=href))

# <b> is expressed through the bold annotation.
convert_b = abstract_inline_conversion(lambda self: Annotations(bold=True))

def convert_blockquote(self, el, convert_as_inline):
Expand All @@ -646,7 +647,11 @@ def convert_blockquote(self, el, convert_as_inline):
return ConvertOutput(main_object=create_rich_text())

# TODO: If text has newlines, split them and add 2, 3, ... lines as children
return ConvertOutput(main_object=create_quote_block())
return ConvertOutput(
main_object=create_quote_block(
typeid=self.options.typeid,
)
)

def convert_br(self, el, convert_as_inline):
if convert_as_inline:
Expand Down Expand Up @@ -683,40 +688,48 @@ def convert_h1(self, el, convert_as_inline):
if convert_as_inline:
return ConvertOutput(main_object=create_rich_text())

return ConvertOutput(main_object=create_h1_block())
return ConvertOutput(main_object=create_h1_block(typeid=self.options.typeid))

def convert_h2(self, el, convert_as_inline):
    """Convert an ``<h2>`` element.

    When ``convert_as_inline`` is true the result wraps a rich-text object
    created with no arguments; otherwise it wraps a new h2 block. The
    ``typeid`` option is forwarded to the block factory.
    """
    if convert_as_inline:
        return ConvertOutput(main_object=create_rich_text())

    return ConvertOutput(main_object=create_h2_block(typeid=self.options.typeid))

def convert_h3(self, el, convert_as_inline):
    """Convert an ``<h3>`` element.

    When ``convert_as_inline`` is true the result wraps a rich-text object
    created with no arguments; otherwise it wraps a new h3 block. The
    ``typeid`` option is forwarded to the block factory.
    """
    if convert_as_inline:
        return ConvertOutput(main_object=create_rich_text())

    return ConvertOutput(main_object=create_h3_block(typeid=self.options.typeid))

def convert_h4(self, el, convert_as_inline):
    """Convert an ``<h4>`` element.

    When ``convert_as_inline`` is true the result wraps a rich-text object
    created with no arguments. Otherwise the heading is emitted as a
    paragraph block (no h4-specific block is used here), with the
    ``typeid`` option forwarded to the block factory.
    """
    if convert_as_inline:
        return ConvertOutput(main_object=create_rich_text())

    return ConvertOutput(
        main_object=create_paragraph_block(typeid=self.options.typeid)
    )

def convert_h5(self, el, convert_as_inline):
    """Convert an ``<h5>`` element.

    When ``convert_as_inline`` is true the result wraps a rich-text object
    created with no arguments. Otherwise the heading is emitted as a
    paragraph block (no h5-specific block is used here), with the
    ``typeid`` option forwarded to the block factory.
    """
    if convert_as_inline:
        return ConvertOutput(main_object=create_rich_text())

    return ConvertOutput(
        main_object=create_paragraph_block(typeid=self.options.typeid)
    )

def convert_h6(self, el, convert_as_inline):
    """Convert an ``<h6>`` element.

    When ``convert_as_inline`` is true the result wraps a rich-text object
    created with no arguments. Otherwise the heading is emitted as a
    paragraph block (no h6-specific block is used here), with the
    ``typeid`` option forwarded to the block factory.
    """
    if convert_as_inline:
        return ConvertOutput(main_object=create_rich_text())

    return ConvertOutput(
        main_object=create_paragraph_block(typeid=self.options.typeid)
    )

def convert_hr(self, el, convert_as_inline):
    """Convert an ``<hr>`` element into a divider block.

    ``convert_as_inline`` is not consulted: a divider block is returned in
    every case, with the ``typeid`` option forwarded to the block factory.
    """
    return ConvertOutput(
        main_object=create_divider_block(typeid=self.options.typeid)
    )

convert_i = convert_em

Expand All @@ -730,13 +743,14 @@ def convert_img(self, el, convert_as_inline):
# title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
if (
convert_as_inline
and el.parent.name not in self.options["keep_inline_images_in"]
and el.parent.name not in self.options.keep_inline_images_in
):
return alt

return ConvertOutput(
main_object=create_image_block(
url=src,
typeid=self.options.typeid,
# alt is not supported in JSON-DOC yet
# caption=alt,
)
Expand All @@ -755,28 +769,38 @@ def convert_list(self, el, convert_as_inline):
def convert_li(self, el, convert_as_inline):
    """Convert an ``<li>`` element into a list-item block.

    An item whose parent element is ``<ol>`` becomes a numbered list item;
    any other item (including one with no parent) becomes a bulleted list
    item. The ``typeid`` option is forwarded to the block factory.
    """
    parent = el.parent
    is_ordered = parent is not None and parent.name == "ol"
    if is_ordered:
        block = create_numbered_list_item_block(typeid=self.options.typeid)
    else:
        block = create_bullet_list_item_block(typeid=self.options.typeid)
    return ConvertOutput(main_object=block)

def convert_p(self, el, convert_as_inline):
    """Convert a ``<p>`` element.

    When ``convert_as_inline`` is true the result wraps a rich-text object
    created with no arguments; otherwise it wraps a new paragraph block.
    The ``typeid`` option is forwarded to the block factory.
    """
    if convert_as_inline:
        return ConvertOutput(main_object=create_rich_text())

    return ConvertOutput(
        main_object=create_paragraph_block(typeid=self.options.typeid)
    )

def convert_pre(self, el, convert_as_inline):
    """Convert a ``<pre>`` element into a code block.

    Returns ``None`` when the element has no text. The block language is
    the ``code_language`` option unless ``code_language_callback`` is set
    and returns a truthy value for ``el``. The ``typeid`` option is
    forwarded to the block factory.
    """
    text = el.get_text()
    if not text:
        return None

    # Options are a model read by attribute, not a dict.
    options = self.options
    code_language = options.code_language
    if options.code_language_callback:
        # A falsy callback result falls back to the configured default.
        code_language = options.code_language_callback(el) or code_language

    return ConvertOutput(
        main_object=create_code_block(
            language=code_language, typeid=options.typeid
        )
    )

def convert_script(self, el, convert_as_inline):
    """Produce no output for ``<script>`` elements (always ``None``)."""
    return None
Expand All @@ -793,19 +817,19 @@ def convert_style(self, el, convert_as_inline):
# Notion does not have an alternative for sub and sup tags, so the
# conversion supplies an Annotations object built with no arguments.
convert_sub = abstract_inline_conversion(lambda self: Annotations())

# <sup> gets no dedicated styling: the conversion supplies an Annotations
# object built with no arguments.
convert_sup = abstract_inline_conversion(lambda self: Annotations())

def convert_table(self, el, convert_as_inline):
    """Convert a ``<table>`` element into a table block.

    ``has_column_header`` is taken from ``html_table_has_header_row(el)``.
    ``convert_as_inline`` is not consulted, and the ``typeid`` option is
    forwarded to the block factory.
    """
    has_column_header = html_table_has_header_row(el)
    return ConvertOutput(
        main_object=create_table_block(
            has_column_header=has_column_header,
            typeid=self.options.typeid,
        )
    )

Expand Down Expand Up @@ -841,10 +865,15 @@ def convert_td(self, el, convert_as_inline):
paragraph_block.rich_text will be extracted to form table_row.cells.
"""
# Get colspan
colspan = el.get("colspan", 1)
colspan = el.get("colspan", "1")
# Get rowspan
# rowspan = el.get("rowspan", 1)
# We need to come up with a much different way to handle rowspan
if not isinstance(colspan, int):
try:
colspan = int(colspan)
except ValueError:
colspan = 1

next_objects = []
if colspan > 1:
Expand All @@ -863,7 +892,9 @@ def convert_tr(self, el, convert_as_inline):
"""
Table row
"""
return ConvertOutput(main_object=create_table_row_block())
return ConvertOutput(
main_object=create_table_row_block(typeid=self.options.typeid)
)


def html_to_jsondoc(html: str | bytes, **options) -> Page | BlockBase | List[BlockBase]:
Expand Down
Loading
Loading