From 8760c33a429ed813ccf57333c0e19edab2aa3e5a Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 24 Oct 2024 17:03:27 -0400 Subject: [PATCH] Handle level 2 fully-unspecified year --- examples/notebooks/edtf-support.ipynb | 81 +++++++------------ src/undate/dateformat/edtf/edtf.lark | 3 +- src/undate/undate.py | 7 +- .../test_dateformat/edtf/test_edtf_parser.py | 6 ++ .../edtf/test_edtf_transformer.py | 5 ++ tests/test_dateformat/test_edtf.py | 15 +++- 6 files changed, 59 insertions(+), 58 deletions(-) diff --git a/examples/notebooks/edtf-support.ipynb b/examples/notebooks/edtf-support.ipynb index c94e1ee..208b7dd 100644 --- a/examples/notebooks/edtf-support.ipynb +++ b/examples/notebooks/edtf-support.ipynb @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "9c6b7379-b2a7-4ec1-afa5-2cd9832c8a5d", "metadata": {}, "outputs": [], @@ -92,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "923476ff-344a-4018-a02e-6e5f80ea76a8", "metadata": {}, "outputs": [], @@ -159,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "6ed422de-34a2-4324-b254-f62db00563f7", "metadata": {}, "outputs": [], @@ -214,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "8d98a139-627b-40bd-b1c5-d0028e538a53", "metadata": {}, "outputs": [], @@ -259,7 +259,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "532470db-851e-4f91-9242-cd93d35054cf", "metadata": {}, "outputs": [], @@ -324,7 +324,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "a5abd0e4-0b26-49b0-bf78-3e1fe6c046d8", "metadata": {}, "outputs": [], @@ -429,7 +429,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "e47f3fff-d35c-4c2e-9568-214763f6511a", "metadata": {}, "outputs": [], @@ -483,7 +483,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "39143c1f-932a-450c-9b2d-ffbe3e1416b0", "metadata": {}, "outputs": [], @@ -537,7 +537,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "95965f17-0bd5-446f-bc09-9503eaed68e2", "metadata": {}, "outputs": [], @@ -591,7 +591,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "c6c2d1a1-39f1-45eb-ac08-1de4fadbe842", "metadata": {}, "outputs": [], @@ -642,36 +642,16 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "id": "f24fd31a-176a-40b5-bff4-d72b68f32a18", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1985\n" - ] - }, - { - "ename": "AssertionError", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[13], line 7\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# format\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(Undate(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1985\u001b[39m))\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mstr\u001b[39m(Undate(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1985\u001b[39m)) \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-1985\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", - "\u001b[0;31mAssertionError\u001b[0m: " - ] - } - ], + "outputs": [], "source": [ "# Example 1 ‘-1985’\n", "# parse\n", "neg_year = Undate.parse(\"-1985\", \"EDTF\")\n", "assert neg_year.year == \"-1985\"\n", "# format\n", - "print(Undate(-1985))\n", "assert str(Undate(-1985)) == \"-1985\"" ] }, @@ -710,7 +690,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "5910caab-eada-4715-b863-9bbbb15b9c5c", "metadata": {}, "outputs": [], @@ -739,20 +719,20 @@ "# format\n", "assert str(Undate(\"15XX\", 12, 25)) == \"15XX-12-25\"\n", "\n", - "# NOT CURRENTLY SUPPORTED\n", "# Example 3 ‘XXXX-12-XX’\n", "# parse\n", - "# december = Undate.parse(\"XXXX-12-XX\", \"EDTF\")\n", - "# assert december.year == \"XXXX\"\n", - "# assert december.month == \"12\"\n", - "# assert december.day == \"XX\"\n", - "# assert december.precision == DatePrecision.DAY\n", + "december = Undate.parse(\"XXXX-12-XX\", \"EDTF\")\n", + "assert december.year == \"XXXX\"\n", + "assert december.month == \"12\"\n", + "assert december.day == \"XX\"\n", + "assert december.precision == DatePrecision.DAY\n", + "# TODO: these must be in a different branch...\n", "# assert december.earliest.year == Undate.MIN_YEAR\n", "# assert december.latest.year == Undate.MAX_YEAR\n", - "# assert december.earliest.day == 1\n", - "# assert december.latest.day == 31\n", + "assert december.earliest.day == 1\n", + "assert december.latest.day == 31\n", "# format\n", - "# assert str(Undate(\"XXXX\", 12, \"XX\")) == \"XXXX-12-XX\"\n", + "assert str(Undate(\"XXXX\", 12, \"XX\")) == \"XXXX-12-XX\"\n", "\n", "# Example 4 '1XXX-XX’\n", "# parse\n", @@ -765,17 +745,16 @@ "# format\n", "assert str(Undate(\"1XXX\", \"XX\")) == \"1XXX-XX\"\n", "\n", - "# NOT CURRENTLY SUPPORTED (parse error)\n", "# Example 5 ‘1XXX-12’\n", "# parse\n", - "# some_december = Undate.parse(\"1XXX-12\", \"EDTF\")\n", - "# assert some_december.year == \"1XXX\"\n", - "# assert some_december.month == \"12\"\n", - "# assert some_december.precision == DatePrecision.MONTH\n", - "# assert some_december.earliest.year == 1000\n", - "# assert some_december.latest.year == 1999\n", - "# # format\n", - "# assert str(Undate(\"1XXX\", 12)) == \"1XXX-12\"\n", + "some_december = Undate.parse(\"1XXX-12\", \"EDTF\")\n", + "assert some_december.year == \"1XXX\"\n", + "assert some_december.month == \"12\"\n", + "assert some_december.precision == DatePrecision.MONTH\n", + "assert some_december.earliest.year == 1000\n", + "assert some_december.latest.year == 1999\n", + "# format\n", + "assert str(Undate(\"1XXX\", 12)) == \"1XXX-12\"\n", "\n", "# Example 6 ‘1984-1X’\n", "# parse\n", diff --git a/src/undate/dateformat/edtf/edtf.lark b/src/undate/dateformat/edtf/edtf.lark index 7ad92ec..f5b34f7 100644 --- a/src/undate/dateformat/edtf/edtf.lark +++ b/src/undate/dateformat/edtf/edtf.lark @@ -35,8 +35,9 @@ uncertain_approximate: "%" // The character 'X' may be used in place of one or more rightmost // digits to indicate that the value of that digit is unspecified +// In Level 2, year may be completely unspecified. unspecified: /X/ -?year_unspecified: /\d+/ unspecified+ +?year_unspecified: /\d+/ unspecified+ | unspecified ~ 4 ?month_unspecified: "0".."1"? unspecified ~ 1..2 //?year_month_unspecified: year_l1 "-" month_unspecified ?day_unspecified: "0".."3"? unspecified ~ 1..2 diff --git a/src/undate/undate.py b/src/undate/undate.py index 3a506d7..eb789ca 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -54,9 +54,12 @@ def __init__( elif year: self.precision = DatePrecision.YEAR - # TODO: refactor partial date min/max calculations + # special case: treat year = XXXX as unknown/none + if year == "XXXX": + year = None if year is not None: + # could we / should we use str.isnumeric here? try: year = int(year) # update initial value since it is used to determine @@ -110,7 +113,7 @@ def __init__( # if we have no day or partial day, calculate min / max min_day = 1 # if we know year and month (or max month), calculate exactly - if year and month: + if year and month and isinstance(year, int): _, max_day = monthrange(int(year), max_month) elif year is None and month: # If we don't have year and month, diff --git a/tests/test_dateformat/edtf/test_edtf_parser.py b/tests/test_dateformat/edtf/test_edtf_parser.py index 06cd5b3..c8c05e8 100644 --- a/tests/test_dateformat/edtf/test_edtf_parser.py +++ b/tests/test_dateformat/edtf/test_edtf_parser.py @@ -32,6 +32,12 @@ "1985-04/..", "../1985-04-12", "/1985-04-12", + # level 2 unspecified digits + "156X-12-25", + "XXXX-12-XX", + "1XXX-12", + "1XXX-XX", + "1984-1X", ] diff --git a/tests/test_dateformat/edtf/test_edtf_transformer.py b/tests/test_dateformat/edtf/test_edtf_transformer.py index 8de09d9..c29bb70 100644 --- a/tests/test_dateformat/edtf/test_edtf_transformer.py +++ b/tests/test_dateformat/edtf/test_edtf_transformer.py @@ -34,6 +34,11 @@ ("1985-04/..", UndateInterval(Undate(1985, 4), None)), ("../1985-04-12", UndateInterval(None, Undate(1985, 4, 12))), ("/1985-04-12", UndateInterval(None, Undate(1985, 4, 12))), + # level 2 unspecified digits + ("156X-12-25", Undate("156X", 12, 25)), + ("XXXX-12-XX", Undate("XXXX", 12, "XX")), + ("1XXX-XX", Undate("1XXX", "XX")), + ("1984-1X", Undate(1984, "1X")), ] diff --git a/tests/test_dateformat/test_edtf.py b/tests/test_dateformat/test_edtf.py index a70f891..4ae2fe9 100644 --- a/tests/test_dateformat/test_edtf.py +++ b/tests/test_dateformat/test_edtf.py @@ -13,15 +13,18 @@ def test_parse_singledate(self): assert str(EDTFDateFormat().parse("201X")) == str(Undate("201X")) assert str(EDTFDateFormat().parse("2004-XX")) == str(Undate(2004, "XX")) # missing year but month/day known - # assert EDTFDateFormat().parse("--05-03") == Undate(month=5, day=3) + # comparison doesn't work because undate knows unknown dates aren't + # necessarily the same, so use string comparison + assert str(EDTFDateFormat().parse("XXXX-05-03")) == Undate( + month=5, day=3 + ).format("EDTF") def test_parse_singledate_unequal(self): assert EDTFDateFormat().parse("2002") != Undate(2003) assert EDTFDateFormat().parse("1991-05") != Undate(1991, 6) assert EDTFDateFormat().parse("1991-05-03") != Undate(1991, 5, 4) # missing year but month/day known - # - does EDTF not support this or is parsing logic incorrect? - # assert EDTFDateFormat().parse("XXXX-05-03") != Undate(month=5, day=4) + assert EDTFDateFormat().parse("XXXX-05-03") != Undate(month=5, day=4) def test_parse_invalid(self): with pytest.raises(ValueError): @@ -47,4 +50,8 @@ def test_to_string(self): assert EDTFDateFormat().to_string(Undate(1991, "0X")) == "1991-0X" assert EDTFDateFormat().to_string(Undate(1991, None, 3)) == "1991-XX-03" - # TODO: override missing digit and confirm replacement + # level 2 unspecified digits + assert EDTFDateFormat().to_string(Undate("156X", 12, 25)) == "156X-12-25" + assert EDTFDateFormat().to_string(Undate("XXXX", 12, "XX")) == "XXXX-12-XX" + assert EDTFDateFormat().to_string(Undate("1XXX", "XX")) == "1XXX-XX" + assert EDTFDateFormat().to_string(Undate(1984, "1X")) == "1984-1X"