Skip to content

Commit

Permalink
Transform HTML links to markdown behind config option (onyx-dot-app#1671)
Browse files Browse the repository at this point in the history
  • Loading branch information
KevinShiCA authored Jun 20, 2024
1 parent 9e30ec1 commit ba58208
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 2 deletions.
6 changes: 6 additions & 0 deletions backend/danswer/configs/app_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from danswer.configs.constants import AuthType
from danswer.configs.constants import DocumentIndexType
from danswer.file_processing.enums import HtmlBasedConnectorTransformLinksStrategy

#####
# App Configs
Expand Down Expand Up @@ -160,6 +161,11 @@
WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL")
WEB_CONNECTOR_VALIDATE_URLS = os.environ.get("WEB_CONNECTOR_VALIDATE_URLS")

# How HTML-based connectors treat links in extracted text. Read from the
# environment as a raw string (not validated or lower-cased here); falls back
# to STRIP when the variable is unset. Because the enum subclasses str, the
# raw value "strip" compares equal to the STRIP member.
# NOTE(review): format_element_text only checks for equality with STRIP, so
# any other value (including typos or different casing) enables markdown links.
HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY = os.environ.get(
"HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY",
HtmlBasedConnectorTransformLinksStrategy.STRIP,
)

NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP = (
os.environ.get("NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP", "").lower()
== "true"
Expand Down
8 changes: 8 additions & 0 deletions backend/danswer/file_processing/enums.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from enum import Enum


class HtmlBasedConnectorTransformLinksStrategy(str, Enum):
    """Strategy for handling HTML links when extracting text from a page.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (for example a setting read from an environment variable).
    """

    # keep only the link text; the link target (href) is dropped
    STRIP = "strip"
    # turn HTML links into markdown links: [text](href)
    MARKDOWN = "markdown"
28 changes: 26 additions & 2 deletions backend/danswer/file_processing/html_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@

import bs4

from danswer.configs.app_configs import HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
from danswer.file_processing.enums import HtmlBasedConnectorTransformLinksStrategy

MINTLIFY_UNWANTED = ["sticky", "hidden"]

Expand All @@ -32,6 +34,19 @@ def strip_newlines(document: str) -> str:
return re.sub(r"[\n\r]+", " ", document)


def format_element_text(element_text: str, link_href: str | None) -> str:
    """Flatten a text node to a single line, optionally as a markdown link.

    Args:
        element_text: Raw text of the node being emitted.
        link_href: Target of the link this text is attributed to, or
            ``None``/empty when the text is not part of a link.

    Returns:
        ``element_text`` with newline runs collapsed by ``strip_newlines``.
        When ``link_href`` is set and the configured strategy is not STRIP,
        the text is wrapped as ``[text](link_href)``; whitespace-only text is
        never wrapped.
    """
    element_text_no_newlines = strip_newlines(element_text)

    if (
        not link_href
        or HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY
        == HtmlBasedConnectorTransformLinksStrategy.STRIP
    ):
        return element_text_no_newlines

    # Whitespace-only text (e.g. formatting whitespace between tags inside an
    # <a>) would otherwise become an empty markdown link such as "[ ](url)".
    if not element_text_no_newlines.strip():
        return element_text_no_newlines

    return f"[{element_text_no_newlines}]({link_href})"


def format_document_soup(
document: bs4.BeautifulSoup, table_cell_separator: str = "\t"
) -> str:
Expand All @@ -49,6 +64,8 @@ def format_document_soup(
verbatim_output = 0
in_table = False
last_added_newline = False
link_href: str | None = None

for e in document.descendants:
verbatim_output -= 1
if isinstance(e, bs4.element.NavigableString):
Expand All @@ -71,7 +88,7 @@ def format_document_soup(
content_to_add = (
element_text
if verbatim_output > 0
else strip_newlines(element_text)
else format_element_text(element_text, link_href)
)

# Don't join separate elements without any spacing
Expand All @@ -98,7 +115,14 @@ def format_document_soup(
elif in_table:
# don't handle other cases while in table
pass

elif e.name == "a":
href_value = e.get("href", None)
# mostly for typing, having multiple hrefs is not valid HTML
link_href = (
href_value[0] if isinstance(href_value, list) else href_value
)
elif e.name == "/a":
link_href = None
elif e.name in ["p", "div"]:
if not list_element_start:
text += "\n"
Expand Down

0 comments on commit ba58208

Please sign in to comment.