Skip to content

Commit

Permalink
Miscellaneous indexing fixes (#4042)
Browse files Browse the repository at this point in the history
  • Loading branch information
pablonyx authored Feb 19, 2025
1 parent 353c185 commit 99fc546
Show file tree
Hide file tree
Showing 10 changed files with 162 additions and 34 deletions.
2 changes: 2 additions & 0 deletions backend/onyx/connectors/bookstack/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

class BookStackClientRequestFailedError(ConnectionError):
def __init__(self, status: int, error: str) -> None:
self.status_code = status
self.error = error
super().__init__(
"BookStack Client request failed with status {status}: {error}".format(
status=status, error=error
Expand Down
40 changes: 40 additions & 0 deletions backend/onyx/connectors/bookstack/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,12 @@
from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.bookstack.client import BookStackApiClient
from onyx.connectors.bookstack.client import BookStackClientRequestFailedError
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from onyx.connectors.interfaces import ConnectorValidationError
from onyx.connectors.interfaces import CredentialExpiredError
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import InsufficientPermissionsError
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
Expand Down Expand Up @@ -214,3 +218,39 @@ def poll_source(
break
else:
time.sleep(0.2)

def validate_connector_settings(self) -> None:
"""
Validate that the BookStack credentials and connector settings are correct.
Specifically checks that we can make an authenticated request to BookStack.
"""
if not self.bookstack_client:
raise ConnectorMissingCredentialError(
"BookStack credentials have not been loaded."
)

try:
# Attempt to fetch a small batch of books (arbitrary endpoint) to verify credentials
_ = self.bookstack_client.get(
"/books", params={"count": "1", "offset": "0"}
)

except BookStackClientRequestFailedError as e:
# Check for HTTP status codes
if e.status_code == 401:
raise CredentialExpiredError(
"Your BookStack credentials appear to be invalid or expired (HTTP 401)."
) from e
elif e.status_code == 403:
raise InsufficientPermissionsError(
"The configured BookStack token does not have sufficient permissions (HTTP 403)."
) from e
else:
raise ConnectorValidationError(
f"Unexpected BookStack error (status={e.status_code}): {e}"
) from e

except Exception as exc:
raise ConnectorValidationError(
f"Unexpected error while validating BookStack connector settings: {exc}"
) from exc
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from collections.abc import Callable
from collections.abc import Iterator
from datetime import datetime
Expand All @@ -24,16 +25,22 @@ def datetime_to_utc(dt: datetime) -> datetime:


def time_str_to_utc(datetime_str: str) -> datetime:
# Remove all timezone abbreviations in parentheses
datetime_str = re.sub(r"\([A-Z]+\)", "", datetime_str).strip()

# Remove any remaining parentheses and their contents
datetime_str = re.sub(r"\(.*?\)", "", datetime_str).strip()

try:
dt = parse(datetime_str)
except ValueError:
# Handle malformed timezone by attempting to fix common format issues
# Fix common format issues (e.g. "0000" => "+0000")
if "0000" in datetime_str:
# Convert "0000" to "+0000" for proper timezone parsing
fixed_dt_str = datetime_str.replace(" 0000", " +0000")
dt = parse(fixed_dt_str)
datetime_str = datetime_str.replace(" 0000", " +0000")
dt = parse(datetime_str)
else:
raise

return datetime_to_utc(dt)


Expand Down
3 changes: 1 addition & 2 deletions backend/onyx/connectors/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,6 @@ def validate_ccpair_for_user(
tenant_id=tenant_id,
)
except Exception as e:
error_msg = f"Unexpected error creating connector: {e}"
raise ConnectorValidationError(error_msg)
raise ConnectorValidationError(str(e))

runnable_connector.validate_connector_settings()
10 changes: 5 additions & 5 deletions backend/onyx/connectors/fireflies/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,12 +187,12 @@ def load_from_state(self) -> GenerateDocumentsOutput:
return self._process_transcripts()

def poll_source(
self, start_unixtime: SecondsSinceUnixEpoch, end_unixtime: SecondsSinceUnixEpoch
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
start_datetime = datetime.fromtimestamp(
start_unixtime, tz=timezone.utc
).strftime("%Y-%m-%dT%H:%M:%S.000Z")
end_datetime = datetime.fromtimestamp(end_unixtime, tz=timezone.utc).strftime(
start_datetime = datetime.fromtimestamp(start, tz=timezone.utc).strftime(
"%Y-%m-%dT%H:%M:%S.000Z"
)
end_datetime = datetime.fromtimestamp(end, tz=timezone.utc).strftime(
"%Y-%m-%dT%H:%M:%S.000Z"
)

Expand Down
3 changes: 2 additions & 1 deletion backend/onyx/connectors/github/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import UnexpectedError
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
Expand Down Expand Up @@ -245,7 +246,7 @@ def validate_connector_settings(self) -> None:
test_repo.get_contents("")

except RateLimitExceededException:
raise ConnectorValidationError(
raise UnexpectedError(
"Validation failed due to GitHub rate-limits being exceeded. Please try again later."
)

Expand Down
1 change: 1 addition & 0 deletions backend/onyx/connectors/gmail/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ def _fetch_threads(
userId=user_email,
fields=THREAD_LIST_FIELDS,
q=query,
continue_on_404_or_403=True,
):
full_threads = execute_paginated_retrieval(
retrieval_function=gmail_service.users().threads().get,
Expand Down
97 changes: 79 additions & 18 deletions backend/onyx/connectors/web/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,12 @@
from onyx.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
from onyx.configs.app_configs import WEB_CONNECTOR_VALIDATE_URLS
from onyx.configs.constants import DocumentSource
from onyx.connectors.interfaces import ConnectorValidationError
from onyx.connectors.interfaces import CredentialExpiredError
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import InsufficientPermissionsError
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import UnexpectedError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.file_processing.extract_file_text import read_pdf_file
Expand Down Expand Up @@ -172,25 +176,34 @@ def start_playwright() -> Tuple[Playwright, BrowserContext]:


def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
response = requests.get(sitemap_url)
response.raise_for_status()

soup = BeautifulSoup(response.content, "html.parser")
urls = [
_ensure_absolute_url(sitemap_url, loc_tag.text)
for loc_tag in soup.find_all("loc")
]

if len(urls) == 0 and len(soup.find_all("urlset")) == 0:
# the given url doesn't look like a sitemap, let's try to find one
urls = list_pages_for_site(sitemap_url)

if len(urls) == 0:
raise ValueError(
f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead."
)
try:
response = requests.get(sitemap_url)
response.raise_for_status()

return urls
soup = BeautifulSoup(response.content, "html.parser")
urls = [
_ensure_absolute_url(sitemap_url, loc_tag.text)
for loc_tag in soup.find_all("loc")
]

if len(urls) == 0 and len(soup.find_all("urlset")) == 0:
# the given url doesn't look like a sitemap, let's try to find one
urls = list_pages_for_site(sitemap_url)

if len(urls) == 0:
raise ValueError(
f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead."
)

return urls
except requests.RequestException as e:
raise RuntimeError(f"Failed to fetch sitemap from {sitemap_url}: {e}")
except ValueError as e:
raise RuntimeError(f"Error processing sitemap {sitemap_url}: {e}")
except Exception as e:
raise RuntimeError(
f"Unexpected error while processing sitemap {sitemap_url}: {e}"
)


def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str:
Expand Down Expand Up @@ -234,6 +247,7 @@ def __init__(
self.batch_size = batch_size
self.recursive = False
self.scroll_before_scraping = scroll_before_scraping
self.web_connector_type = web_connector_type

if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value:
self.recursive = True
Expand Down Expand Up @@ -419,6 +433,53 @@ def load_from_state(self) -> GenerateDocumentsOutput:
raise RuntimeError(last_error)
raise RuntimeError("No valid pages found.")

def validate_connector_settings(self) -> None:
# Make sure we have at least one valid URL to check
if not self.to_visit_list:
raise ConnectorValidationError(
"No URL configured. Please provide at least one valid URL."
)

if self.web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.SITEMAP.value:
return None

# We'll just test the first URL for connectivity and correctness
test_url = self.to_visit_list[0]

# Check that the URL is allowed and well-formed
try:
protected_url_check(test_url)
except ValueError as e:
raise ConnectorValidationError(
f"Protected URL check failed for '{test_url}': {e}"
)
except ConnectionError as e:
# Typically DNS or other network issues
raise ConnectorValidationError(str(e))

# Make a quick request to see if we get a valid response
try:
check_internet_connection(test_url)
except Exception as e:
err_str = str(e)
if "401" in err_str:
raise CredentialExpiredError(
f"Unauthorized access to '{test_url}': {e}"
)
elif "403" in err_str:
raise InsufficientPermissionsError(
f"Forbidden access to '{test_url}': {e}"
)
elif "404" in err_str:
raise ConnectorValidationError(f"Page not found for '{test_url}': {e}")
elif "Max retries exceeded" in err_str and "NameResolutionError" in err_str:
raise ConnectorValidationError(
f"Unable to resolve hostname for '{test_url}'. Please check the URL and your internet connection."
)
else:
# Could be a 5xx or another error, treat as unexpected
raise UnexpectedError(f"Unexpected error validating '{test_url}': {e}")


if __name__ == "__main__":
connector = WebConnector("https://docs.onyx.app/")
Expand Down
3 changes: 2 additions & 1 deletion backend/onyx/server/documents/cc_pair.py
Original file line number Diff line number Diff line change
Expand Up @@ -665,7 +665,8 @@ def associate_credential_to_connector(
logger.error(f"IntegrityError: {e}")
raise HTTPException(status_code=400, detail="Name must be unique")

except Exception:
except Exception as e:
logger.exception(f"Unexpected error: {e}")
raise HTTPException(status_code=500, detail="Unexpected error")


Expand Down
22 changes: 19 additions & 3 deletions backend/onyx/server/documents/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from onyx.configs.constants import MilestoneRecordType
from onyx.configs.constants import OnyxCeleryPriority
from onyx.configs.constants import OnyxCeleryTask
from onyx.connectors.factory import validate_ccpair_for_user
from onyx.connectors.google_utils.google_auth import (
get_google_oauth_creds,
)
Expand Down Expand Up @@ -61,6 +62,7 @@
from onyx.connectors.google_utils.shared_constants import (
GoogleOAuthAuthenticationMethod,
)
from onyx.connectors.interfaces import ConnectorValidationError
from onyx.db.connector import create_connector
from onyx.db.connector import delete_connector
from onyx.db.connector import fetch_connector_by_id
Expand Down Expand Up @@ -844,11 +846,22 @@ def create_connector_with_mock_credential(
db_session=db_session,
)

# Store the created connector and credential IDs
connector_id = cast(int, connector_response.id)
credential_id = credential.id

validate_ccpair_for_user(
connector_id=connector_id,
credential_id=credential_id,
db_session=db_session,
user=user,
tenant_id=tenant_id,
)
response = add_credential_to_connector(
db_session=db_session,
user=user,
connector_id=cast(int, connector_response.id), # will aways be an int
credential_id=credential.id,
connector_id=connector_id,
credential_id=credential_id,
access_type=connector_data.access_type,
cc_pair_name=connector_data.name,
groups=connector_data.groups,
Expand All @@ -873,9 +886,12 @@ def create_connector_with_mock_credential(
properties=None,
db_session=db_session,
)

return response

except ConnectorValidationError as e:
raise HTTPException(
status_code=400, detail="Connector validation error: " + str(e)
)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))

Expand Down

0 comments on commit 99fc546

Please sign in to comment.