import datetime
import re


def chunk_list(l, n):
    """Yield ``n`` strided slices of ``l``.

    Slice ``i`` holds the elements at positions ``i, i + n, i + 2n, ...``.
    """
    for i in range(0, n):
        yield l[i::n]


class TimeZoneFixedOffset(datetime.tzinfo):
    """A ``tzinfo`` with a fixed offset from UTC and no daylight saving.

    :param hours: signed whole hours of the offset (e.g. ``-1`` for ``-01:30``)
    :param mins: minutes of the offset, always non-negative
    :param negative: explicit sign of the offset. Needed for offsets such as
        ``-00:30`` where ``hours`` is zero and so cannot carry the sign.
        When left as ``None`` the sign is taken from ``hours``.
    """

    def __init__(self, hours, mins, *, negative=None):
        self.hours = hours
        self.mins = mins
        self.negative = hours < 0 if negative is None else bool(negative)

    def utcoffset(self, dt):
        """Return the offset as a timedelta; minutes follow the overall sign."""
        magnitude = datetime.timedelta(hours=abs(self.hours), minutes=self.mins)
        return -magnitude if self.negative else magnitude

    def tzname(self, dt):
        """Return a name such as ``UTC+01:30`` or ``UTC-00:30``."""
        # A zero offset is always shown as "+00:00", never "-00:00".
        sign = "-" if self.negative and (self.hours or self.mins) else "+"
        return "UTC{sign}{hours:02d}:{mins:02d}".format(
            sign=sign, hours=abs(self.hours), mins=self.mins
        )

    def dst(self, dt):
        """Fixed offsets have no daylight saving adjustment."""
        return datetime.timedelta(0)


# YYYY-MM-DD followed by an optional "Z" or "+hh:mm" / "-hh:mm" time zone.
# re.ASCII stops \d from matching non-ASCII digits; the pattern is used with
# fullmatch() so nothing (not even a trailing newline) may follow.
_XSD_DATE_RE = re.compile(
    r'(\d{4})-(\d{2})-(\d{2})(?:Z|([+-])(\d{2}):(\d{2}))?',
    re.ASCII,
)

# xsd time zone offsets must lie between -14:00 and +14:00 inclusive.
_MAX_TZ_OFFSET_MINUTES = 14 * 60


def parse_xsd_date_value(in_str):
    """
    Takes in a string that may be a valid xsd:date value

    Returns a datetime object (at midnight) if it is, None if it is not.
    Anything that is not a string also returns None.

    A bare date and a date with a "Z" time zone give a naive datetime.
    A date with a "+hh:mm" / "-hh:mm" time zone gives an aware datetime
    whose tzinfo is a TimeZoneFixedOffset.

    Years larger than 9999 should work but Python won't let us.
    See https://www.w3.org/TR/xmlschema-2/#date section 3.2.9.1 leading to section 3.2.7.1
    """
    if not isinstance(in_str, str):
        return None
    match = _XSD_DATE_RE.fullmatch(in_str)
    if match is None:
        return None
    year, month, day, sign, tz_hours, tz_mins = match.groups()

    tzinfo = None
    if sign is not None:
        # We can't use strptime's %z for this as that works with -/+0000
        # and https://www.w3.org/TR/xmlschema-2/#dateTime section 3.2.7.3 defines -/+00:00
        tz_hours = int(tz_hours)
        tz_mins = int(tz_mins)
        if tz_mins > 59 or tz_hours * 60 + tz_mins > _MAX_TZ_OFFSET_MINUTES:
            return None
        is_negative = sign == '-'
        tzinfo = TimeZoneFixedOffset(
            -tz_hours if is_negative else tz_hours,
            tz_mins,
            negative=is_negative,
        )

    try:
        # datetime validates the calendar date (month range, days in month, year >= 1)
        return datetime.datetime(int(year), int(month), int(day), tzinfo=tzinfo)
    except ValueError:
        return None
from library.utils import get_hash_for_identifier, parse_xsd_date_value
import pytest


def test_get_hash_for_identifier_1():
    """The identifier "cat" hashes to a known, fixed digest."""
    expected_digest = "9d989e8d27dc9e0ec3389fc855f142c3d40f0c50"
    assert get_hash_for_identifier("cat") == expected_digest


# Pairs of (input string, expected isoformat() of the parsed value);
# an expected value of None means the input must be rejected.
PARSE_XSD_DATE_VALUE = [
    # just nonsense
    ('cat', None),
    # dates only
    ('2023-11-15', '2023-11-15T00:00:00'),
    # dates only ... that aren't valid
    ('2023-13-15', None),
    ('2023-00-15', None),
    ('2023-01-32', None),
    ('2023-02-30', None),
    # dates and Z time zones
    ('2023-11-15Z', '2023-11-15T00:00:00'),
    # dates and Z time zones ... that aren't valid
    ('2023-13-15Z', None),
    ('2023-00-15Z', None),
    ('2023-01-32Z', None),
    ('2023-02-30Z', None),
    # dates and offset time zones
    ('2023-11-15+00:00', '2023-11-15T00:00:00+00:00'),
    ('2023-11-15+01:00', '2023-11-15T00:00:00+01:00'),
    ('2023-11-15+01:30', '2023-11-15T00:00:00+01:30'),
    ('2023-11-15-00:00', '2023-11-15T00:00:00+00:00'),
    ('2023-11-15-01:00', '2023-11-15T00:00:00-01:00'),
    ('2023-11-15-01:30', '2023-11-15T00:00:00-01:30'),
    # dates and offset time zones ... that aren't valid
    ('2023-13-15-00:00', None),
    ('2023-00-15-00:00', None),
    ('2023-01-32-00:00', None),
    ('2023-02-30-00:00', None),
    # This should be valid in xsd:date but python can't handle years bigger than 9999
    ('10000-01-01', None),
]


@pytest.mark.parametrize("in_value, expected_value", PARSE_XSD_DATE_VALUE)
def test_parse_xsd_date_value(in_value, expected_value):
    """Each input either parses to the expected ISO string or is rejected."""
    parsed = parse_xsd_date_value(in_value)
    if not expected_value:
        assert parsed is None
        return
    assert parsed.isoformat() == expected_value