Merge pull request #65 from dh-tech/feature/5-date-comparisons

preliminary date comparison methods
dh-tech · Apr 25, 2024 · 1cef86b · 1cef86b
2 parents ab378da + fcdd4c2
commit 1cef86b
Show file tree

Hide file tree

Showing 3 changed files with 257 additions and 19 deletions.
diff --git a/src/undate/undate.py b/src/undate/undate.py
@@ -1,6 +1,6 @@
 import datetime
 from calendar import monthrange
-from enum import Enum, auto
+from enum import IntEnum
 import re
 
 # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None
@@ -15,16 +15,22 @@
 ONE_DAY = datetime.timedelta(days=1)
 
 
-class DatePrecision(Enum):
+class DatePrecision(IntEnum):
     """date precision, to indicate date precision independent from how much
     of the date is known."""
 
-    #: year
-    YEAR = auto()
-    #: month
-    MONTH = auto()
+    # numbers should be set to allow logical greater than / less than
+    # comparison, e.g. year precision > month
+
     #: day
-    DAY = auto()
+    DAY = 1
+    #: month
+    MONTH = 2
+    #: year
+    YEAR = 3
+
+    def __str__(self):
+        return f"{self.name}"
 
 
 class Undate:
@@ -173,18 +179,93 @@ def __repr__(self) -> str:
             return "<Undate '%s' (%s)>" % (self.label, self)
         return "<Undate %s>" % self
 
-    def __eq__(self, other: "Undate") -> bool:
-        # question: should label be taken into account when checking equality?
-        # for now, assuming label differences don't matter for comparing dates
-        return (
+    def __eq__(self, other: Union["Undate", datetime.date]) -> bool:
+        # Note: assumes label differences don't matter for comparing dates
+
+        # only a day-precision fully known undate can be equal to a datetime.date
+        if isinstance(other, datetime.date):
+            return self.earliest == other and self.latest == other
+
+        # check for apparent equality
+        looks_equal = (
             self.earliest == other.earliest
             and self.latest == other.latest
-            # NOTE: assumes that partially known values can only be written
-            # in one format (i.e. X for missing digits).
-            # If we support other formats, will need to normalize to common
-            # internal format for comparison
             and self.initial_values == other.initial_values
         )
+        # if everything looks the same, check for any unknowns in initial values
+        # the same unknown date should NOT be considered equal
+
+        # NOTE: assumes that partially known values can only be written
+        # in one format (i.e. X for missing digits).
+        # If we support other formats, will need to normalize to common
+        # internal format for comparison
+        if looks_equal and any("X" in str(val) for val in self.initial_values.values()):
+            return False
+        return looks_equal
+
+    def __lt__(self, other: Union["Undate", datetime.date]) -> bool:
+        # support datetime.date by converting to undate
+        if isinstance(other, datetime.date):
+            other = Undate.from_datetime_date(other)
+
+        # if this date ends before the other date starts,
+        # return true (this date is earlier, so it is less)
+        if self.latest < other.earliest:
+            return True
+
+        # if the other one ends before this one starts,
+        # return false (this date is later, so it is not less)
+        if other.latest < self.earliest:
+            return False
+
+        # otherwise the two date ranges overlap; check if one is included within the other
+        # (e.g., single date within the same year)
+        # comparison for those cases is not currently supported
+        elif other in self or self in other:
+            raise NotImplementedError(
+                "Can't compare when one date falls within the other"
+            )
+        # NOTE: unsupported comparisons are supposed to return NotImplemented
+        # However, doing that in this case results in a confusing TypeError!
+        #   TypeError: '<' not supported between instances of 'Undate' and 'Undate'
+        # How to handle when the comparison is ambiguous / indeterminate?
+        # we may need a tribool / ternary type (true, false, unknown),
+        # but not sure what python builtin methods will do with it (unknown = false?)
+
+        # for any other case (i.e., self == other), return false
+        return False
+
+    def __gt__(self, other: Union["Undate", datetime.date]) -> bool:
+        # define gt ourselves so we can support > comparison with datetime.date,
+        # but rely on existing less than implementation.
+        # strictly greater than must rule out equals
+        return not (self < other or self == other)
+
+    def __le__(self, other: Union["Undate", datetime.date]) -> bool:
+        return self == other or self < other
+
+    def __contains__(self, other: Union["Undate", datetime.date]) -> bool:
+        # if the two dates are strictly equal, don't consider
+        # either one as containing the other
+
+        # support comparison with datetime by converting to undate
+        if isinstance(other, datetime.date):
+            other = Undate.from_datetime_date(other)
+
+        if self == other:
+            return False
+
+        return (
+            self.earliest <= other.earliest
+            and self.latest >= other.latest
+            # is precision sufficient for comparing partially known dates?
+            and self.precision > other.precision
+        )
+
+    @staticmethod
+    def from_datetime_date(dt_date):
+        """Initialize an :class:`Undate` object from a :class:`datetime.date`"""
+        return Undate(dt_date.year, dt_date.month, dt_date.day)
 
     @property
     def known_year(self) -> bool:

diff --git a/tests/test_dateformat/test_base.py b/tests/test_dateformat/test_base.py
@@ -31,9 +31,12 @@ def test_parse_to_string(self):
             BaseDateFormat().to_string(1991)
 
 
-@pytest.mark.first
 def test_import_formatters_import_only_once(caplog):
-    # run first so we can confirm it runs once
+    # clear the cache, since any instantiation of an Undate
+    # object anywhere in the test suite will populate it
+    BaseDateFormat.import_formatters.cache_clear()
+
+    # run first, and confirm it runs and loads formatters
     with caplog.at_level(logging.DEBUG):
         import_count = BaseDateFormat.import_formatters()
     # should import at least one thing (iso8601)

diff --git a/tests/test_undate.py b/tests/test_undate.py
@@ -1,8 +1,13 @@
-from datetime import timedelta
+from datetime import timedelta, date
 
 import pytest
 
-from undate.undate import Undate, UndateInterval
+from undate.undate import Undate, UndateInterval, DatePrecision
+
+
+class TestDatePrecision:
+    def test_str(self):
+        assert str(DatePrecision.YEAR) == "YEAR"
 
 
 class TestUndate:
@@ -121,12 +126,27 @@ def test_invalid_date(self):
         with pytest.raises(ValueError):
             Undate(1990, 22)
 
+    def test_from_datetime_date(self):
+        undate_from_date = Undate.from_datetime_date(date(2001, 3, 5))
+        assert isinstance(undate_from_date, Undate)
+        assert undate_from_date == Undate(2001, 3, 5)
+
     def test_eq(self):
         assert Undate(2022) == Undate(2022)
         assert Undate(2022, 10) == Undate(2022, 10)
         assert Undate(2022, 10, 1) == Undate(2022, 10, 1)
         assert Undate(month=2, day=7) == Undate(month=2, day=7)
 
+    def test_eq_datetime_date(self):
+        # support comparisons with datetime objects for full day-precision
+        assert Undate(2022, 10, 1) == date(2022, 10, 1)
+        assert Undate(2022, 10, 1) != date(2022, 10, 2)
+        assert Undate(1980, 10, 1) != date(2022, 10, 1)
+
+        # other date precisions are not equal
+        assert Undate(2022) != date(2022, 10, 1)
+        assert Undate(2022, 10) != date(2022, 10, 1)
+
     def test_not_eq(self):
         assert Undate(2022) != Undate(2023)
         assert Undate(2022, 10) != Undate(2022, 11)
@@ -135,6 +155,140 @@ def test_not_eq(self):
         assert Undate(2022) != Undate(2022, 10)
         assert Undate(2022, 10) != Undate(2022, 10, 1)
 
+        # partially unknown dates should NOT be considered equal
+        assert Undate("19XX") != Undate("19XX")
+        assert Undate(1980, "XX") != Undate(1980, "XX")
+
+    testdata_lt_gt = [
+        # dates to test for gt/lt comparison: earlier date, later date
+        # - simple cases: same precision where one date is clearly earlier
+        (Undate(2022), Undate(2023)),
+        (Undate(1991, 1), Undate(1991, 5)),
+        (Undate(1856, 3, 3), Undate(1856, 3, 21)),
+        # - mixed precision where one date is clearly earlier
+        (Undate(1991, 1), Undate(2000)),
+        (Undate(1856, 3, 3), Undate(1901)),
+        # partially known digits where comparison is possible
+        (Undate("19XX"), Undate("20XX")),
+        (Undate(1900, "0X"), Undate(1900, "1X")),
+        # compare with datetime.date objects
+        (Undate("19XX"), date(2020, 1, 1)),
+        (Undate(1991, 1), date(1992, 3, 4)),
+    ]
+
+    @pytest.mark.parametrize("earlier,later", testdata_lt_gt)
+    def test_lt(self, earlier, later):
+        assert earlier < later
+        assert later > earlier
+
+    testdata_lte_gte = testdata_lt_gt.copy()
+    # add a few exactly equal cases
+    testdata_lte_gte.extend(
+        [
+            (Undate(1601), Undate(1601)),
+            (Undate(1991, 1), Undate(1991, 1)),
+            (Undate(1492, 5, 3), Undate(1492, 5, 3)),
+            # compare with datetime.date also
+            (Undate(1492, 5, 3), date(1492, 5, 3)),
+        ]
+    )
+
+    def test_lt_when_eq(self):
+        # strict less than / greater should return false when equal
+        assert not Undate(1900) > Undate(1900)
+        assert not Undate(1900) < Undate(1900)
+        # same for datetime.date
+        assert not Undate(1903, 1, 5) < date(1903, 1, 5)
+        assert not Undate(1903, 1, 5) > date(1903, 1, 5)
+
+    @pytest.mark.parametrize("earlier,later", testdata_lte_gte)
+    def test_lte(self, earlier, later):
+        assert earlier <= later
+        assert later >= earlier
+
+    def test_lt_notimplemented(self):
+        # how to compare mixed precision where dates overlap?
+        # if the second date falls *within* earliest/latest,
+        # then it is not clearly less; not implemented?
+        with pytest.raises(NotImplementedError, match="date falls within the other"):
+            assert Undate(2022) < Undate(2022, 5)
+
+        # same if we attempt to compare in the other direction
+        with pytest.raises(NotImplementedError, match="date falls within the other"):
+            assert Undate(2022, 5) < Undate(2022)
+
+    testdata_contains = [
+        # first date falls within the range of the other
+        # dates within range: middle, start, end, varying precision
+        (Undate(2022, 6), Undate(2022)),
+        (Undate(2022, 1, 1), Undate(2022)),
+        (Undate(2022, 12, 31), Undate(2022)),
+        (Undate(2022, 6, 15), Undate(2022, 6)),
+        # support contains with datetime.date
+        (date(2022, 6, 1), Undate(2022)),
+        (date(2022, 6, 1), Undate(2022, 6)),
+    ]
+
+    @pytest.mark.parametrize("date1,date2", testdata_contains)
+    def test_contains(self, date1, date2):
+        assert date1 in date2
+
+    testdata_not_contains = [
+        # dates not in range
+        (Undate(1980), Undate(2020)),
+        (Undate(1980), Undate(2020, 6)),
+        (Undate(1980, 6), Undate(2020, 6)),
+        # support contains with datetime.date
+        (date(1980, 6, 1), Undate(2022)),
+        (date(3001, 6, 1), Undate(2022, 6)),
+        # partially known dates that are similar but same precision,
+        # so one does not contain the other
+        (Undate("199X"), Undate("19XX")),
+        # - specific month to unknown month
+        (Undate(1980, 6), Undate(1980, "XX")),
+        # some of these might overlap, but we don't have enough
+        # information to determine
+        # - unknown month to unknown month
+        (Undate(1980, "XX"), Undate(1980, "XX")),
+        # - partially unknown month to unknown month
+        (Undate(1801, "1X"), Undate(1801, "XX")),
+    ]
+
+    @pytest.mark.parametrize("date1,date2", testdata_not_contains)
+    def test_not_contains(self, date1, date2):
+        assert date1 not in date2
+
+    def test_sorting(self):
+        # sorting should be possible based on gt/lt
+        # test simple cases for sorting
+        d1980 = Undate(1980)
+        d2002_10 = Undate(2002, 10)
+        d2002_12 = Undate(2002, 12)
+        d2012_05_01 = Undate(2012, 5, 1)
+
+        assert sorted([d2012_05_01, d2002_12, d2002_10, d1980]) == [
+            d1980,
+            d2002_10,
+            d2002_12,
+            d2012_05_01,
+        ]
+
+        # what about semi-ambiguous cases?
+        d1991_XX = Undate(1991, "XX")
+        d1992_01_XX = Undate(1992, 1, "XX")
+        assert sorted([d1992_01_XX, d1991_XX, d1980]) == [d1980, d1991_XX, d1992_01_XX]
+
+        # what about things we can't compare?
+        d1991 = Undate(1991)
+        d1991_02 = Undate(1991, 2)
+        # for now, this will raise a not implemented error
+        with pytest.raises(NotImplementedError):
+            sorted([d1991_02, d1991, d1991_XX])
+
+        # TODO: partially known year?
+        # someyear = Undate("1XXX")
+        # assert sorted([d1991, someyear]) == [someyear, d1991]
+
     def test_duration(self):
         day_duration = Undate(2022, 11, 7).duration()
         assert isinstance(day_duration, timedelta)