From ba58208a859df783a6202ea658b47f324b6609bb Mon Sep 17 00:00:00 2001
From: Kevin Shi
Date: Thu, 20 Jun 2024 11:43:15 -0600
Subject: [PATCH] Transform HTML links to markdown behind config option (#1671)

---
NOTE(review): line breaks and indentation in this patch were restored from a
whitespace-collapsed copy; blank context lines were placed so that every hunk
matches its "@@ -a,b +c,d @@" counts. Nesting depth inside the
format_document_soup hunks is inferred -- confirm against the tree before
applying.

NOTE(review): in html_utils.py the new `elif e.name == "/a"` branch looks
unreachable -- BeautifulSoup's `descendants` appears to yield only
Tag/NavigableString objects, and a Tag's name would be "a", never "/a". If so,
`link_href` stays set after the first <a> and later non-link text could be
wrapped in that link's markdown. Please confirm, and consider deriving the href
from the text node's enclosing <a> (e.g. `e.find_parent("a")`) instead of
carrying state across loop iterations.

 backend/danswer/configs/app_configs.py        |  6 ++++
 backend/danswer/file_processing/enums.py      |  8 ++++++
 backend/danswer/file_processing/html_utils.py | 28 +++++++++++++++++--
 3 files changed, 40 insertions(+), 2 deletions(-)
 create mode 100644 backend/danswer/file_processing/enums.py

diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py
index 996700aef48..d12c0d03c73 100644
--- a/backend/danswer/configs/app_configs.py
+++ b/backend/danswer/configs/app_configs.py
@@ -4,6 +4,7 @@
 
 from danswer.configs.constants import AuthType
 from danswer.configs.constants import DocumentIndexType
+from danswer.file_processing.enums import HtmlBasedConnectorTransformLinksStrategy
 
 #####
 # App Configs
@@ -160,6 +161,11 @@
 WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL")
 WEB_CONNECTOR_VALIDATE_URLS = os.environ.get("WEB_CONNECTOR_VALIDATE_URLS")
 
+HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY = os.environ.get(
+    "HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY",
+    HtmlBasedConnectorTransformLinksStrategy.STRIP,
+)
+
 NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP = (
     os.environ.get("NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP", "").lower()
     == "true"
diff --git a/backend/danswer/file_processing/enums.py b/backend/danswer/file_processing/enums.py
new file mode 100644
index 00000000000..f532d0ebfcc
--- /dev/null
+++ b/backend/danswer/file_processing/enums.py
@@ -0,0 +1,8 @@
+from enum import Enum
+
+
+class HtmlBasedConnectorTransformLinksStrategy(str, Enum):
+    # remove links entirely
+    STRIP = "strip"
+    # turn HTML links into markdown links
+    MARKDOWN = "markdown"
diff --git a/backend/danswer/file_processing/html_utils.py b/backend/danswer/file_processing/html_utils.py
index 9b5875227a0..48782981f89 100644
--- a/backend/danswer/file_processing/html_utils.py
+++ b/backend/danswer/file_processing/html_utils.py
@@ -5,8 +5,10 @@
 
 import bs4
 
+from danswer.configs.app_configs import HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
+from danswer.file_processing.enums import HtmlBasedConnectorTransformLinksStrategy
 
 MINTLIFY_UNWANTED = ["sticky", "hidden"]
 
@@ -32,6 +34,19 @@ def strip_newlines(document: str) -> str:
     return re.sub(r"[\n\r]+", " ", document)
 
 
+def format_element_text(element_text: str, link_href: str | None) -> str:
+    element_text_no_newlines = strip_newlines(element_text)
+
+    if (
+        not link_href
+        or HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY
+        == HtmlBasedConnectorTransformLinksStrategy.STRIP
+    ):
+        return element_text_no_newlines
+
+    return f"[{element_text_no_newlines}]({link_href})"
+
+
 def format_document_soup(
     document: bs4.BeautifulSoup, table_cell_separator: str = "\t"
 ) -> str:
@@ -49,6 +64,8 @@ def format_document_soup(
     verbatim_output = 0
     in_table = False
     last_added_newline = False
+    link_href: str | None = None
+
     for e in document.descendants:
         verbatim_output -= 1
         if isinstance(e, bs4.element.NavigableString):
@@ -71,7 +88,7 @@ def format_document_soup(
                 content_to_add = (
                     element_text
                     if verbatim_output > 0
-                    else strip_newlines(element_text)
+                    else format_element_text(element_text, link_href)
                 )
 
                 # Don't join separate elements without any spacing
@@ -98,7 +115,14 @@ def format_document_soup(
             elif in_table:
                 # don't handle other cases while in table
                 pass
-
+            elif e.name == "a":
+                href_value = e.get("href", None)
+                # mostly for typing, having multiple hrefs is not valid HTML
+                link_href = (
+                    href_value[0] if isinstance(href_value, list) else href_value
+                )
+            elif e.name == "/a":
+                link_href = None
             elif e.name in ["p", "div"]:
                 if not list_element_start:
                     text += "\n"