Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add special case to time_str_to_utc to handle invalid tz names #3922

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import re
import string
from collections.abc import Callable
from collections.abc import Iterator
from datetime import datetime
from datetime import timezone
from typing import TypeVar

from dateutil.parser import parse
from dateutil.parser import parserinfo

from onyx.configs.app_configs import CONNECTOR_LOCALHOST_OVERRIDE
from onyx.configs.constants import IGNORE_FOR_QA
Expand All @@ -16,6 +19,14 @@
U = TypeVar("U")


def _is_valid_tzname(tz_name_str: str) -> bool:
# based on dateutil.parser _could_be_tzname
return len(tz_name_str) <= 5 and (
all(x in string.ascii_uppercase for x in tz_name_str)
or tz_name_str in parserinfo.UTCZONE
)


def datetime_to_utc(dt: datetime) -> datetime:
if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None:
dt = dt.replace(tzinfo=timezone.utc)
Expand All @@ -32,6 +43,22 @@ def time_str_to_utc(datetime_str: str) -> datetime:
# Convert "0000" to "+0000" for proper timezone parsing
fixed_dt_str = datetime_str.replace(" 0000", " +0000")
dt = parse(fixed_dt_str)
elif (
len(
matches := list(
match
for match in re.findall(r"\+\d{4} \(([^)]*)\)", datetime_str)
if not _is_valid_tzname(match)
)
)
== 1
):
# Where a string contains both an offset AND a timezone name BUT the name is invalid: remove the name
# e.g.
# +0300 (+03) -> +0300
# +1100 (AUSNSW) -> +1100
fixed_dt_str = datetime_str.replace(f" ({matches[0]})", "")
dt = parse(fixed_dt_str)
else:
raise
return datetime_to_utc(dt)
Expand Down
14 changes: 14 additions & 0 deletions backend/tests/unit/onyx/connectors/gmail/test_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def test_build_time_range_query() -> None:

def test_time_str_to_utc() -> None:
str_to_dt = {
# well-formed strings:
"Tue, 5 Oct 2021 09:38:25 GMT": datetime.datetime(
2021, 10, 5, 9, 38, 25, tzinfo=datetime.timezone.utc
),
Expand All @@ -57,6 +58,19 @@ def test_time_str_to_utc() -> None:
"22 Mar 2020 20:12:18 +0000 (GMT)": datetime.datetime(
2020, 3, 22, 20, 12, 18, tzinfo=datetime.timezone.utc
),
# malformed strings, which should be fixed automatically:
# 0000 corrected to +0000
"22 Mar 2020 20:12:18 0000": datetime.datetime(
2020, 3, 22, 20, 12, 18, tzinfo=datetime.timezone.utc
),
# invalid (+03) removed
"Thu, 23 Jan 2025 11:04:48 +0300 (+03)": datetime.datetime(
2025, 1, 23, 8, 4, 48, tzinfo=datetime.timezone.utc
),
# invalid (AUSNSW) removed
"Sat, 25 Jan 2025 05:15:30 +1100 (AUSNSW)": datetime.datetime(
2025, 1, 24, 18, 15, 30, tzinfo=datetime.timezone.utc
),
}
for strptime, expected_datetime in str_to_dt.items():
assert time_str_to_utc(strptime) == expected_datetime
Loading