From ba58208a859df783a6202ea658b47f324b6609bb Mon Sep 17 00:00:00 2001
From: Kevin Shi <kevinshisvf@gmail.com>
Date: Thu, 20 Jun 2024 11:43:15 -0600
Subject: [PATCH] Transform HTML links to markdown behind config option (#1671)

---
 backend/danswer/configs/app_configs.py        |  6 ++++
 backend/danswer/file_processing/enums.py      |  8 ++++++
 backend/danswer/file_processing/html_utils.py | 28 +++++++++++++++++--
 3 files changed, 40 insertions(+), 2 deletions(-)
 create mode 100644 backend/danswer/file_processing/enums.py

diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py
index 996700aef48..d12c0d03c73 100644
--- a/backend/danswer/configs/app_configs.py
+++ b/backend/danswer/configs/app_configs.py
@@ -4,6 +4,7 @@
 
 from danswer.configs.constants import AuthType
 from danswer.configs.constants import DocumentIndexType
+from danswer.file_processing.enums import HtmlBasedConnectorTransformLinksStrategy
 
 #####
 # App Configs
@@ -160,6 +161,11 @@
 WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL")
 WEB_CONNECTOR_VALIDATE_URLS = os.environ.get("WEB_CONNECTOR_VALIDATE_URLS")
 
+HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY = os.environ.get(
+    "HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY",
+    HtmlBasedConnectorTransformLinksStrategy.STRIP,
+)
+
 NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP = (
     os.environ.get("NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP", "").lower()
     == "true"
diff --git a/backend/danswer/file_processing/enums.py b/backend/danswer/file_processing/enums.py
new file mode 100644
index 00000000000..f532d0ebfcc
--- /dev/null
+++ b/backend/danswer/file_processing/enums.py
@@ -0,0 +1,8 @@
+from enum import Enum
+
+
+class HtmlBasedConnectorTransformLinksStrategy(str, Enum):
+    # drop the link target (href) and keep only the link's visible text
+    STRIP = "strip"
+    # turn HTML links into markdown links of the form [text](href)
+    MARKDOWN = "markdown"
diff --git a/backend/danswer/file_processing/html_utils.py b/backend/danswer/file_processing/html_utils.py
index 9b5875227a0..48782981f89 100644
--- a/backend/danswer/file_processing/html_utils.py
+++ b/backend/danswer/file_processing/html_utils.py
@@ -5,8 +5,10 @@
 
 import bs4
 
+from danswer.configs.app_configs import HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
+from danswer.file_processing.enums import HtmlBasedConnectorTransformLinksStrategy
 
 MINTLIFY_UNWANTED = ["sticky", "hidden"]
 
@@ -32,6 +34,19 @@ def strip_newlines(document: str) -> str:
     return re.sub(r"[\n\r]+", " ", document)
 
 
+def format_element_text(element_text: str, link_href: str | None) -> str:
+    """Collapse newlines; render as a markdown link unless links are stripped."""
+    flattened = strip_newlines(element_text)
+    strip_links = (
+        HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY
+        == HtmlBasedConnectorTransformLinksStrategy.STRIP
+    )
+    # No href to attach, or links are configured away: emit the bare text.
+    if strip_links or not link_href:
+        return flattened
+    return f"[{flattened}]({link_href})"
+
+
 def format_document_soup(
     document: bs4.BeautifulSoup, table_cell_separator: str = "\t"
 ) -> str:
@@ -49,6 +64,8 @@ def format_document_soup(
     verbatim_output = 0
     in_table = False
     last_added_newline = False
+    link_href: str | None = None
+
     for e in document.descendants:
         verbatim_output -= 1
         if isinstance(e, bs4.element.NavigableString):
@@ -71,7 +88,7 @@ def format_document_soup(
                 content_to_add = (
                     element_text
                     if verbatim_output > 0
-                    else strip_newlines(element_text)
+                    else format_element_text(element_text, link_href)
                 )
 
                 # Don't join separate elements without any spacing
@@ -98,7 +115,14 @@ def format_document_soup(
             elif in_table:
                 # don't handle other cases while in table
                 pass
-
+            elif e.name == "a":
+                href_value = e.get("href", None)
+                # mostly for typing, having multiple hrefs is not valid HTML
+                link_href = (
+                    href_value[0] if isinstance(href_value, list) else href_value
+                )
+            elif e.name == "/a":
+                link_href = None
             elif e.name in ["p", "div"]:
                 if not list_element_start:
                     text += "\n"