Skip to content

Commit

Permalink
Merge pull request #298 from IATI/2023-11-15
Browse files Browse the repository at this point in the history
Add parse_xsd_date_value - use in Flattener
  • Loading branch information
James (ODSC) authored Nov 15, 2023
2 parents 59d6e5b + 7941c58 commit faee4b5
Show file tree
Hide file tree
Showing 7 changed files with 140 additions and 19 deletions.
2 changes: 1 addition & 1 deletion src/library/flatten.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def _add_to_output(self, canonical_name, value, output):
# Date time?
if [x for x in self.CANONICAL_NAMES_WITH_DATE_TIMES if x in canonical_name]:
try:
dt_object = dateutil.parser.parse(value)
dt_object = utils.parse_xsd_date_value(value) or dateutil.parser.parse(value)
if dt_object:
# This mirrors output of old flatterer system
value = dt_object.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]+"Z"
Expand Down
63 changes: 63 additions & 0 deletions src/library/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import chardet
from library.logger import getLogger
import hashlib
import re
import datetime

logger = getLogger("utils")

Expand Down Expand Up @@ -45,3 +47,64 @@ def get_hash_for_identifier(id):
def chunk_list(l, n):
    """
    Lazily yields n interleaved slices of l.
    The slice for start index i is l[i::n], so it holds every n-th item
    of l beginning at position i. Slices are yielded for i = 0 .. n-1.
    """
    yield from (l[start::n] for start in range(n))


class TimeZoneFixedOffset(datetime.tzinfo):
    """
    A tzinfo with a fixed offset from UTC and no daylight saving.

    hours carries the sign of the offset. mins is normally a non-negative
    magnitude that is applied in the same direction as hours.
    When hours is 0 the sign of mins itself is used, so an offset of -00:30
    is written as TimeZoneFixedOffset(0, -30).
    """

    def __init__(self, hours, mins):
        self.hours = hours
        self.mins = mins

    def utcoffset(self, dt):
        """Returns the offset from UTC as a timedelta. dt is not used."""
        # hours == 0 must take this branch - treating zero as negative
        # would turn +00:30 into -00:30
        if self.hours >= 0:
            return datetime.timedelta(hours=self.hours, minutes=self.mins)
        else:
            return datetime.timedelta(hours=self.hours, minutes=(0-self.mins))

    def tzname(self, dt):
        """Returns a name in the form UTC+hh:mm or UTC-hh:mm. dt is not used."""
        # Built from utcoffset() so the name can never disagree with the offset,
        # and so the sign is kept apart from the zero padded hours
        total_mins = self.utcoffset(dt) // datetime.timedelta(minutes=1)
        sign = "-" if total_mins < 0 else "+"
        hours, mins = divmod(abs(total_mins), 60)
        return "UTC{sign}{hours:02d}:{mins:02d}".format(sign=sign, hours=hours, mins=mins)

    def dst(self, dt):
        """Returns a zero timedelta. dt is not used."""
        return datetime.timedelta(0)


# Groups 1 to 3 are year, month and day.
# Group 4 is the whole optional time zone. When it is a numeric offset,
# groups 5 to 7 are its sign, hours and minutes.
# [0-9] is used instead of \d so that only ASCII digits are accepted.
_XSD_DATE_PATTERN = re.compile(
    r'([0-9]{4})-([0-9]{2})-([0-9]{2})(Z|([+-])([0-9]{2}):([0-9]{2}))?'
)


def parse_xsd_date_value(in_str):
    """
    Takes in a string that may be a valid xsd:date value
    Returns a datetime object if it is (None if it is not)
    The time part of the returned datetime is always midnight.
    A date with no time zone, or with the Z time zone, gives a naive datetime.
    A date with a +hh:mm or -hh:mm time zone gives a datetime
    whose tzinfo is a TimeZoneFixedOffset.
    Years larger than 9999 should work but Python won't let us.
    See https://www.w3.org/TR/xmlschema-2/#date section 3.2.9.1 leading to section 3.2.7.1
    """
    # fullmatch means nothing may come before or after the value - not even a trailing new line.
    # Month and day must be exactly two digits, so values like 2023-1-5 are rejected.
    match = _XSD_DATE_PATTERN.fullmatch(in_str)
    if not match:
        return None

    tzinfo = None
    sign = match.group(5)
    if sign:
        # We can't use strptime and %z as that works with -/+0000
        # and https://www.w3.org/TR/xmlschema-2/#dateTime section 3.2.7.3 defines -/+00:00
        hours = int(match.group(6))
        mins = int(match.group(7))
        # Section 3.2.7.3 limits hours to 14 and minutes to 59,
        # and minutes must be 0 when hours is 14
        if hours > 14 or mins > 59 or (hours == 14 and mins != 0):
            return None
        if sign == '-':
            hours = 0 - hours
            if hours == 0:
                # There is no negative zero for hours, so the minutes carry the sign
                mins = 0 - mins
        tzinfo = TimeZoneFixedOffset(hours, mins)

    try:
        return datetime.datetime(
            int(match.group(1)),
            int(match.group(2)),
            int(match.group(3)),
            tzinfo=tzinfo
        )
    except ValueError:
        # Right shape but not a real calendar date, such as month 13 or 30th February
        return None

15 changes: 9 additions & 6 deletions src/tests/fixtures_flatten_flatterer/bad_dates.expected.json
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
[
{
"dataset_version": "",
"iati_identifier": "ACT-1"
"dataset_version": "2.03",
"iati_identifier": "big-year",
"activity_date_type": "1"
},
{
"dataset_version": "",
"iati_identifier": "ACT-2"
"dataset_version": "2.03",
"iati_identifier": "rubbish-1",
"activity_date_type": "1"
},
{
"dataset_version": "",
"iati_identifier": "ACT-3"
"dataset_version": "2.03",
"iati_identifier": "rubbish-2",
"activity_date_type": "1"
}
]
18 changes: 12 additions & 6 deletions src/tests/fixtures_flatten_flatterer/bad_dates.input.xml
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
<iati-activities>
<iati-activities version="2.03">
<!-- last-updated-datetime is xsd:dateTime. activity-date iso-date is xsd:date - test separately in case parsing is different! -->
<!-- test case - actually years bigger than 9999 should be allowed but Python won't let us. Don't crash. -->
<iati-activity last-updated-datetime="12021-11-05">
<iati-identifier>ACT-1</iati-identifier>
<iati-identifier>big-year</iati-identifier>
<activity-date iso-date="12021-11-05" type="1" />
</iati-activity>
<iati-activity last-updated-datetime="2006-04-23+09:00">
<iati-identifier>ACT-2</iati-identifier>
<!-- test case - various rubbish won't crash -->
<iati-activity last-updated-datetime="cat">
<iati-identifier>rubbish-1</iati-identifier>
<activity-date iso-date="cat" type="1" />
</iati-activity>
<iati-activity last-updated-datetime="2010-01-01Z">
<iati-identifier>ACT-3</iati-identifier>
<iati-activity last-updated-datetime="2023-15-40">
<iati-identifier>rubbish-2</iati-identifier>
<activity-date iso-date="2023-15-40" type="1" />
</iati-activity>
</iati-activities>
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
[
{
"dataset_version": "2.03",
"iati_identifier": "ACT-1",
"last_updated_datetime": "2023-07-17T08:05:08.160Z"
}
{
"dataset_version": "2.03",
"last_updated_datetime": "2023-07-17T08:05:08.160Z",
"iati_identifier": "ACT-1"
},
{
"dataset_version": "2.03",
"last_updated_datetime": "2023-07-17T00:00:00.000Z",
"iati_identifier": "ACT-2"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,7 @@
<iati-activity last-updated-datetime="2023-07-17T08:05:08.160000+00:00">
<iati-identifier>ACT-1</iati-identifier>
</iati-activity>
<iati-activity last-updated-datetime="2023-07-17+00:00">
<iati-identifier>ACT-2</iati-identifier>
</iati-activity>
</iati-activities>
43 changes: 42 additions & 1 deletion src/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,46 @@
from library.utils import get_hash_for_identifier
from library.utils import get_hash_for_identifier, parse_xsd_date_value
import pytest

def test_get_hash_for_identifier_1():
    # Pins the hash produced for one known identifier
    expected_hash = "9d989e8d27dc9e0ec3389fc855f142c3d40f0c50"
    actual_hash = get_hash_for_identifier("cat")
    assert expected_hash == actual_hash

# Well formed values that name a day that does not exist.
# Each one is checked on its own, with a Z time zone and with an offset time zone.
_IMPOSSIBLE_DATES = ('2023-13-15', '2023-00-15', '2023-01-32', '2023-02-30')

# Pairs of (input string, expected isoformat() of the result).
# An expected value of None means the parser should return None for that input.
PARSE_XSD_DATE_VALUE = [
    # just nonsense
    ('cat', None),
    # dates only
    ('2023-11-15', '2023-11-15T00:00:00'),
    # dates only ... that aren't valid
    *[(bad_date, None) for bad_date in _IMPOSSIBLE_DATES],
    # dates and Z time zones
    ('2023-11-15Z', '2023-11-15T00:00:00'),
    # dates and Z time zones ... that aren't valid
    *[(bad_date + 'Z', None) for bad_date in _IMPOSSIBLE_DATES],
    # dates and offset time zones
    ('2023-11-15+00:00', '2023-11-15T00:00:00+00:00'),
    ('2023-11-15+01:00', '2023-11-15T00:00:00+01:00'),
    ('2023-11-15+01:30', '2023-11-15T00:00:00+01:30'),
    ('2023-11-15-00:00', '2023-11-15T00:00:00+00:00'),
    ('2023-11-15-01:00', '2023-11-15T00:00:00-01:00'),
    ('2023-11-15-01:30', '2023-11-15T00:00:00-01:30'),
    # dates and offset time zones ... that aren't valid
    *[(bad_date + '-00:00', None) for bad_date in _IMPOSSIBLE_DATES],
    # This should be valid in xsd:date but python can't handle years bigger than 9999
    ('10000-01-01', None),
]

@pytest.mark.parametrize("in_value, expected_value", PARSE_XSD_DATE_VALUE)
def test_parse_xsd_date_value(in_value, expected_value):
    # Each case gives an input and either the isoformat() text the parsed
    # value should have, or a falsy value when the parser should return None
    parsed = parse_xsd_date_value(in_value)
    if not expected_value:
        assert parsed is None
        return
    assert parsed.isoformat() == expected_value

0 comments on commit faee4b5

Please sign in to comment.