Skip to content

Commit

Permalink
Make EDTF parser available as undate formatter; handle 5+ digit years
Browse files Browse the repository at this point in the history
  • Loading branch information
rlskoeser committed Aug 16, 2024
1 parent 329fa3d commit 418837f
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 10 deletions.
26 changes: 26 additions & 0 deletions src/undate/dateformat/edtf/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from typing import Dict, List, Union

from lark.exceptions import UnexpectedCharacters

from undate.undate import Undate, UndateInterval
from undate.dateformat.base import BaseDateFormat
from undate.dateformat.edtf.parser import edtf_parser
from undate.dateformat.edtf.transformer import EDTFTransformer


class EDTFDateFormat(BaseDateFormat):
    """Date format that parses EDTF strings into undate objects.

    Input is parsed with ``edtf_parser`` and the resulting parse tree is
    converted by an :class:`EDTFTransformer` instance.
    """

    name: str = "EDTF"

    def __init__(self):
        # transformer used by parse() to convert the parse tree
        self.transformer = EDTFTransformer()

    def parse(self, value: str) -> Union[Undate, UndateInterval]:
        """Parse ``value`` and return the transformed result.

        :param value: the date string to parse
        :returns: the object produced by transforming the parse tree
        :raises ValueError: if the parser raises ``UnexpectedCharacters``;
            the parser error is chained as the cause of the ValueError
        """
        # parse the input string, then transform to undate object
        try:
            parsetree = edtf_parser.parse(value)
            return self.transformer.transform(parsetree)
        except UnexpectedCharacters as err:
            # chain explicitly so the parser error is kept as __cause__
            raise ValueError(
                "Parsing failed due to UnexpectedCharacters: %s" % err
            ) from err

    # def to_string(self, undate: Undate) -> str:
    # TODO: how do we leverage the parser for this?
8 changes: 4 additions & 4 deletions src/undate/dateformat/edtf/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ def day_unspecified(self, items):
# Handler for date_level1: delegates to self.date with the same items
# and returns whatever self.date returns.
def date_level1(self, items):
return self.date(items)

def year_fivedigitsplus(self, items):
    """Convert a five-or-more digit year token into a ``year`` tree.

    :param items: sequence whose first element is the matched token;
        its ``value`` is the year text with a leading ``Y``
    :returns: a ``Tree`` with data ``"year"`` and the integer year as
        its only child
    """
    # strip off the leading Y and convert to integer
    token = items[0]
    year = int(token.value.lstrip("Y"))
    # return as a year tree; presumably so it is handled like any other
    # year by the rest of the transformer (verify against the date handler)
    return Tree(data="year", children=[year])
3 changes: 2 additions & 1 deletion src/undate/dateformat/iso8601.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Dict, List, Union

from undate.dateformat.base import BaseDateFormat
from undate.undate import Undate, UndateInterval
from typing import Dict, List, Union


class ISO8601DateFormat(BaseDateFormat):
Expand Down
8 changes: 3 additions & 5 deletions tests/test_dateformat/edtf/test_edtf_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,10 @@
("1001-03-30", Undate(1001, 3, 30)),
("1000/2000", UndateInterval(Undate(1000), Undate(2000))),
("1000-01/2000-05-01", UndateInterval(Undate(1000, 1), Undate(2000, 5, 1))),
# # level 1
# NOTE: undate currently doesn't support most of the level 1 functionality
# NOTE: undate currently doesn't support years beyond 9999 (datetime.MAXYEAR)
# ("Y17000002", Undate(17000002)),
# level 1
("Y17000002", Undate(17000002)),
# "2001-21", # spring 2001
# # qualifiers
# qualifiers TODO - not yet supported by undate
# "1984?",
# "2004-06~",
# "2004-06-11%",
Expand Down
39 changes: 39 additions & 0 deletions tests/test_dateformat/test_edtf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import pytest

from undate.dateformat.edtf import EDTFDateFormat
from undate.undate import Undate, UndateInterval


class TestEDTFDateFormat:
    """Tests for parsing EDTF strings with :class:`EDTFDateFormat`."""

    def test_parse_singledate(self):
        """Single dates parse to the equivalent Undate."""
        assert EDTFDateFormat().parse("2002") == Undate(2002)
        assert EDTFDateFormat().parse("1991-05") == Undate(1991, 5)
        assert EDTFDateFormat().parse("1991-05-03") == Undate(1991, 5, 3)
        # unknown dates are not strictly equal, but string comparison should match
        assert str(EDTFDateFormat().parse("201X")) == str(Undate("201X"))
        assert str(EDTFDateFormat().parse("2004-XX")) == str(Undate(2004, "XX"))
        # missing year but month/day known
        # assert EDTFDateFormat().parse("--05-03") == Undate(month=5, day=3)

    def test_parse_singledate_unequal(self):
        """Parsed dates do not compare equal to a different Undate."""
        assert EDTFDateFormat().parse("2002") != Undate(2003)
        assert EDTFDateFormat().parse("1991-05") != Undate(1991, 6)
        assert EDTFDateFormat().parse("1991-05-03") != Undate(1991, 5, 4)
        # missing year but month/day known
        # - does EDTF not support this or is parsing logic incorrect?
        # assert EDTFDateFormat().parse("XXXX-05-03") != Undate(month=5, day=4)

    def test_parse_invalid(self):
        """Unparseable input raises ValueError."""
        # a single-digit month is expected to be rejected; only the call
        # belongs inside the raises block, since nothing is returned to compare
        with pytest.raises(ValueError):
            EDTFDateFormat().parse("1991-5")

    def test_parse_range(self):
        """A slash-separated pair of dates parses to an UndateInterval."""
        assert EDTFDateFormat().parse("1800/1900") == UndateInterval(
            Undate(1800), Undate(1900)
        )

    # def test_to_string(self):
    #     # NOTE: iso8601 to_string currently tested more thoroughly
    #     # in undate str tests; may want to move those tests here
    #     assert EDTFDateFormat().to_string(Undate(900)) == "0900"
    #     assert EDTFDateFormat().to_string(Undate(33)) == "0033"

0 comments on commit 418837f

Please sign in to comment.