diff --git a/src/undate/converters/base.py b/src/undate/converters/base.py index ecdbf9b..14bff87 100644 --- a/src/undate/converters/base.py +++ b/src/undate/converters/base.py @@ -28,6 +28,7 @@ formatter methods as desired/appropriate for your converter as well as the additional methods for ``max_month``, ``max_day``, and convertion ``to_gregorian`` calendar. +- Import your calendar in ``undate/converters/calendars/__init__.py`` and include it in ``__all__`` - Add unit tests for the new calendar logic under ``tests/test_converters/calendars/`` - Add the new calendar to the ``Calendar`` enum of supported calendars in ``undate/undate.py`` and confirm that the `get_converter` method loads your @@ -136,9 +137,13 @@ class BaseCalendarConverter(BaseDateConverter): #: Converter name. Subclasses must define a unique name. name: str = "Base Calendar Converter" - def max_month(self, year: int) -> int: - """Maximum month for this calendar for this year""" - raise NotImplementedError + def min_month(self) -> int: + """First month for this calendar. Defaults to 1.""" + return 1 + + def max_month(self) -> int: + """Last month for this calendar. 
Defaults to 12.""" + return 12 def max_day(self, year: int, month: int) -> int: """maximum numeric day for the specified year and month in this calendar""" diff --git a/src/undate/converters/calendars/gregorian.py b/src/undate/converters/calendars/gregorian.py index f794329..9a3e2a9 100644 --- a/src/undate/converters/calendars/gregorian.py +++ b/src/undate/converters/calendars/gregorian.py @@ -15,10 +15,6 @@ class GregorianDateConverter(BaseCalendarConverter): #: known non-leap year NON_LEAP_YEAR: int = 2022 - def max_month(self, year: int) -> int: - """Maximum month for this calendar for this year""" - return 12 - def max_day(self, year: int, month: int) -> int: """maximum numeric day for the specified year and month in this calendar""" # if month is known, use that to calculate diff --git a/src/undate/converters/calendars/hebrew/__init__.py b/src/undate/converters/calendars/hebrew/__init__.py new file mode 100644 index 0000000..4ac5b4b --- /dev/null +++ b/src/undate/converters/calendars/hebrew/__init__.py @@ -0,0 +1,3 @@ +from undate.converters.calendars.hebrew.converter import HebrewDateConverter + +__all__ = ["HebrewDateConverter"] diff --git a/src/undate/converters/calendars/hebrew/converter.py b/src/undate/converters/calendars/hebrew/converter.py new file mode 100644 index 0000000..7d83dc7 --- /dev/null +++ b/src/undate/converters/calendars/hebrew/converter.py @@ -0,0 +1,71 @@ +from typing import Union + +from convertdate import hebrew # type: ignore +from lark.exceptions import UnexpectedCharacters + +from undate.converters.base import BaseCalendarConverter +from undate.converters.calendars.hebrew.parser import hebrew_parser +from undate.converters.calendars.hebrew.transformer import HebrewDateTransformer +from undate.undate import Undate, UndateInterval + + +class HebrewDateConverter(BaseCalendarConverter): + """ + Converter for Hebrew Anno Mundi calendar. 
+ + Support for parsing Anno Mundi dates and converting to Undate and UndateInterval + objects in the Gregorian calendar. + """ + + #: converter name: Hebrew + name: str = "Hebrew" + calendar_name: str = "Anno Mundi" + + def __init__(self): + self.transformer = HebrewDateTransformer() + + def min_month(self) -> int: + """first numeric month of the civil year in this calendar""" + # hebrew calendar civil year starts in Tishri + return hebrew.TISHRI + + def max_month(self) -> int: + """last numeric month of the civil year in this calendar""" + # hebrew calendar civil year starts in Tishri + # Elul is the month before Tishri + return hebrew.ELUL + + def max_day(self, year: int, month: int) -> int: + """maximum numeric day for the specified year and month in this calendar""" + # NOTE: unreleased v2.4.1 of convertdate standardizes month_days to month_length + return hebrew.month_days(year, month) + + def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]: + """Convert a Hebrew date, specified by year, month, and day, + to the Gregorian equivalent date. Returns a tuple of year, month, day. + """ + return hebrew.to_gregorian(year, month, day) + + def parse(self, value: str) -> Union[Undate, UndateInterval]: + """ + Parse a Hebrew date string and return an :class:`~undate.undate.Undate` or + :class:`~undate.undate.UndateInterval`. + The Hebrew date string is preserved in the undate label. 
+ """ + if not value: + raise ValueError("Parsing empty string is not supported") + + # parse the input string, then transform to undate object + try: + # parse the string with our Hebrew date parser + parsetree = hebrew_parser.parse(value) + # transform the parse tree into an undate or undate interval + undate_obj = self.transformer.transform(parsetree) + # set the original date as a label, with the calendar name + undate_obj.label = f"{value} {self.calendar_name}" + return undate_obj + except UnexpectedCharacters as err: + raise ValueError(f"Could not parse '{value}' as a Hebrew date") from err + + # do we need to support conversion in the other direction? + # i.e., generate a Hebrew date from an arbitrary undate or undate interval? diff --git a/src/undate/converters/calendars/hebrew/hebrew.lark b/src/undate/converters/calendars/hebrew/hebrew.lark new file mode 100644 index 0000000..64e527b --- /dev/null +++ b/src/undate/converters/calendars/hebrew/hebrew.lark @@ -0,0 +1,55 @@ +%import common.WS +%ignore WS + +// only support day month year format for now +// parser requires numeric day and year to be distinguished based on order +hebrew_date: day month year | month year | year + +// TODO: handle date ranges? + +// TODO: add support for qualifiers? 
+// PGP dates use qualifiers like "first decade of" (for beginning of month) +// "first third of", seasons (can look for more examples) + +year: /\d+/ + +// months +month: month_1 + | month_2 + | month_3 + | month_4 + | month_5 + | month_6 + | month_7 + | month_8 + | month_9 + | month_10 + | month_11 + | month_12 + | month_13 +// months have 29 or 30 days; we do not expect leading zeroes +day: /[1-9]/ | /[12][0-9]/ | /30/ + +// months, in order; from convertdate list +// with variants from Princeton Geniza Project +// support matching with and without accents +month_1: "Nisan" +// Iyar or Iyyar +month_2: /Iyy?ar/ +month_3: "Sivan" +month_4: "Tammuz" +month_5: "Av" +month_6: "Elul" +// Tishrei or Tishri +month_7: /Tishre?i/ +month_8: "Heshvan" +month_9: "Kislev" +// Tevet or Teveth +month_10: /[ṬT]eveth?/ +month_11: "Shevat" +// Adar I or Adar +month_12: /Adar( I)?/ +// Adar II or Adar Bet +month_13: /Adar (II|Bet)/ + + diff --git a/src/undate/converters/calendars/hebrew/parser.py b/src/undate/converters/calendars/hebrew/parser.py new file mode 100644 index 0000000..5654f60 --- /dev/null +++ b/src/undate/converters/calendars/hebrew/parser.py @@ -0,0 +1,9 @@ +import pathlib + +from lark import Lark + +grammar_path = pathlib.Path(__file__).parent / "hebrew.lark" + +with open(grammar_path) as grammar: + # NOTE: LALR parser is faster but can't be used due to ambiguity between years and dates + hebrew_parser = Lark(grammar.read(), start="hebrew_date", strict=True) diff --git a/src/undate/converters/calendars/hebrew/transformer.py b/src/undate/converters/calendars/hebrew/transformer.py new file mode 100644 index 0000000..a6d2888 --- /dev/null +++ b/src/undate/converters/calendars/hebrew/transformer.py @@ -0,0 +1,40 @@ +from lark import Transformer, Tree + +from undate.undate import Undate, Calendar + + +class HebrewUndate(Undate): + """Undate convenience subclass; sets default calendar to Hebrew.""" + + calendar = Calendar.HEBREW + + +class HebrewDateTransformer(Transformer): + 
"""Transform a Hebrew date parse tree and return an Undate or + UndateInterval.""" + + def hebrew_date(self, items): + parts = {} + for child in items: + if child.data in ["year", "month", "day"]: + # in each case we expect one integer value; + # anonymous tokens convert to their value and cast as int + value = int(child.children[0]) + parts[str(child.data)] = value + + # initialize and return an undate with Hebrew year, month, day and + # Hebrew calendar + return HebrewUndate(**parts) + + # year translation is not needed since we want a tree with name year + # this is equivalent to a no-op + # def year(self, items): + # return Tree(data="year", children=[items[0]]) + + def month(self, items): + # month has a nested tree for the rule and the value + # the name of the rule (month_1, month_2, etc) gives us the + # number of the month needed for converting the date + tree = items[0] + month_n = tree.data.split("_")[-1] + return Tree(data="month", children=[month_n]) diff --git a/src/undate/converters/calendars/hijri/__init__.py b/src/undate/converters/calendars/hijri/__init__.py index 4ac5b4b..8c28d52 100644 --- a/src/undate/converters/calendars/hijri/__init__.py +++ b/src/undate/converters/calendars/hijri/__init__.py @@ -1,3 +1,4 @@ from undate.converters.calendars.hijri.converter import HijriDateConverter +from undate.converters.calendars.hebrew.converter import HebrewDateConverter -__all__ = ["HijriDateConverter"] +__all__ = ["HijriDateConverter", "HebrewDateConverter"] diff --git a/src/undate/converters/calendars/hijri/converter.py b/src/undate/converters/calendars/hijri/converter.py index 9a8ad72..910c67e 100644 --- a/src/undate/converters/calendars/hijri/converter.py +++ b/src/undate/converters/calendars/hijri/converter.py @@ -24,10 +24,6 @@ class HijriDateConverter(BaseCalendarConverter): def __init__(self): self.transformer = HijriDateTransformer() - def max_month(self, year: int) -> int: - """maximum numeric month for the specified year in this calendar""" 
- return 12 - def max_day(self, year: int, month: int) -> int: """maximum numeric day for the specified year and month in this calendar""" return islamic.month_length(year, month) @@ -41,8 +37,8 @@ def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]: def parse(self, value: str) -> Union[Undate, UndateInterval]: """ Parse a Hijri date string and return an :class:`~undate.undate.Undate` or - :class:`~undate.undate.UndateInterval` in Gregorian calendar. - The Hijri date string is preserved in the undate label + :class:`~undate.undate.UndateInterval`. + The Hijri date string is preserved in the undate label. """ if not value: raise ValueError("Parsing empty string is not supported") diff --git a/src/undate/undate.py b/src/undate/undate.py index 8a10073..0c635c0 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -22,6 +22,7 @@ class Calendar(StrEnum): GREGORIAN = auto() HIJRI = auto() + HEBREW = auto() @staticmethod def get_converter(calendar): @@ -123,10 +124,11 @@ def calculate_earliest_latest(self, year, month, day): if month == "XX": month = None - min_month = 1 # is min month ever anything other than 1 ? - # get max month from the calendar, since it depends on the - # calendar and potentially the year (e.g. leap years in Hebrew Anno Mundi) - max_month = self.calendar_converter.max_month(max_year) + # get first and last month from the calendar, since it is not + # always 1 and 12 + # TODO need to differentiate between min/max and first/last! 
+ min_month = self.calendar_converter.min_month() + max_month = self.calendar_converter.max_month() if month is not None: try: # treat as an integer if we can @@ -137,7 +139,9 @@ def calculate_earliest_latest(self, year, month, day): except ValueError: # if not, calculate min/max for missing digits min_month, max_month = self._missing_digit_minmax( - str(month), min_month, max_month + str(month), + 1, + 12, # min_month, max_month ) # similar to month above — unknown day, but day-level granularity if day == "XX": diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py new file mode 100644 index 0000000..f335975 --- /dev/null +++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py @@ -0,0 +1,142 @@ +import pytest + +from undate.converters.calendars.hebrew.converter import HebrewDateConverter +from undate.converters.calendars.hebrew.transformer import HebrewUndate +from undate.undate import Calendar, Undate +from undate.date import DatePrecision, Date + + +class TestHebrewDateConverter: + def test_parse(self): + # day + # 26 Tammuz 4816: Tammuz = month 4 (17 July, 1056 Gregorian) + date_str = "26 Tammuz 4816" + date = HebrewDateConverter().parse(date_str) + assert date == HebrewUndate(4816, 4, 26) + assert date.calendar == Calendar.HEBREW + assert date.precision == DatePrecision.DAY + assert date.label == f"{date_str} {HebrewDateConverter.calendar_name}" + + # month + date_str = "Ṭevet 5362" + date = HebrewDateConverter().parse(date_str) + assert date == HebrewUndate(5362, 10) # Teveth = month 10 + assert date.calendar == Calendar.HEBREW + assert date.precision == DatePrecision.MONTH + assert date.label == f"{date_str} {HebrewDateConverter.calendar_name}" + + # year + date_str = "4932" + date = HebrewDateConverter().parse(date_str) + assert date == HebrewUndate(4932) + assert date.calendar == Calendar.HEBREW + assert date.precision == 
DatePrecision.YEAR + assert date.label == f"{date_str} {HebrewDateConverter.calendar_name}" + + def test_gregorian_earliest_latest(self): + # earliest/latest should be converted to Gregorian for comparison + + # full date + + # 26 Tammuz 4816: 17 July, 1056; Tammuz = month 4 + date = HebrewUndate(4816, 4, 26) + assert date.earliest == Date(1056, 7, 17) + assert date.latest == Date(1056, 7, 17) + # 13 Tishrei 5416 Anno Mundi (1655-10-14) + date = HebrewUndate(5416, 7, 13) # Tishrei = month 7 + assert date.earliest == Date(1655, 10, 14) + assert date.latest == Date(1655, 10, 14) + + # month + + # Ṭevet 5362 Anno Mundi (25 December, 1601 – 22 January, 1602) + date = HebrewUndate(5362, 10) + assert date.earliest == Date(1601, 12, 25) + assert date.latest == Date(1602, 1, 22) + + # year + # 5416 : October 1655 to September 1656 + date = HebrewUndate(5416) + assert date.earliest == Date(1655, 10, 2) + assert date.latest == Date(1656, 9, 18) + + def test_parse_error(self): + # a string we can't parse should raise an error + with pytest.raises(ValueError): + HebrewDateConverter().parse("January 2, 1991") + # empty string should also error + with pytest.raises(ValueError): + HebrewDateConverter().parse("") + + def test_partially_known(self): + # hebrew dates get existing partially unknown behavior + + converter = HebrewDateConverter() + + # hebrew first/last month are not the same as min/max + unknown_month = HebrewUndate(1243, "XX") + assert unknown_month.precision == DatePrecision.MONTH + assert unknown_month.earliest == Date( + *converter.to_gregorian(1243, converter.min_month(), 1) + ) + max_month = converter.max_month() + assert unknown_month.latest == Date( + *converter.to_gregorian(1243, max_month, converter.max_day(1243, max_month)) + ) + + partially_unknown_month = HebrewUndate(1243, "1X") + assert partially_unknown_month.precision == DatePrecision.MONTH + assert partially_unknown_month.earliest == Date( + *converter.to_gregorian(1243, 10, 1) + ) + assert 
partially_unknown_month.latest == Date( + *converter.to_gregorian(1243, 12, 30) + ) + + # second month has 29 days + unknown_day = HebrewUndate(1243, 2, "XX") + assert unknown_day.precision == DatePrecision.DAY + assert unknown_day.earliest == Date(*converter.to_gregorian(1243, 2, 1)) + assert unknown_day.latest == Date(*converter.to_gregorian(1243, 2, 29)) + + partially_unknown_day = HebrewUndate(1243, 2, "2X") + assert partially_unknown_day.precision == DatePrecision.DAY + assert partially_unknown_day.earliest == Date( + *converter.to_gregorian(1243, 2, 20) + ) + assert partially_unknown_day.latest == Date( + *converter.to_gregorian(1243, 2, 29) + ) + + def test_compare_across_calendars(self): + # only day-precision dates can be exactly equal across calendars + + # 26 Tammuz 4816: Tammuz = month 4 (17 July, 1056 Gregorian) + assert HebrewUndate(4816, 4, 26) == Undate(1056, 7, 17) + # 13 Tishrei 5416; Tishrei = month 7 (1655-10-14) + assert HebrewUndate(5416, 7, 13) == Undate(1655, 10, 14) + + # greater than / less than + assert HebrewUndate(4816) < Undate(1060) + assert HebrewUndate(5416) < Undate(1660) + assert HebrewUndate(5416, 7) > Undate(1655, 1) + assert HebrewUndate(4816, 4, 26) > Undate(1055, 5) + + # 26 Tammuz 4816: Tammuz = month 4 (17 July, 1056) + # so it falls within or is contained by July 1056 + assert HebrewUndate(4816, 4, 26) in Undate(1056, 7) + assert HebrewUndate(4816, 4, 26) not in Undate(1054) + + # sorting + sorted_dates = sorted( + [ + HebrewUndate(4816, 4, 26), # 1056-07-17 + HebrewUndate(5416), # 1655 + HebrewUndate(500), # -3261 + Undate(1995), + Undate(33), + Undate(1350), + ] + ) + expected_gregorian_years = [-3261, 33, 1056, 1350, 1655, 1995] + assert [d.earliest.year for d in sorted_dates] == expected_gregorian_years diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py new file mode 100644 index 0000000..e4894b1 --- /dev/null 
+++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py @@ -0,0 +1,63 @@ +import pytest +from undate.converters.calendars.hebrew.parser import hebrew_parser + + +# for now, just test that valid dates can be parsed + +testcases = [ + # year + "5362", + # month + year + # - with and without accent + "Ṭevet 5362", + "Tevet 5362", + "Elul 4932", + "Sivan 5581", + # variant month name, with or without accent + "Ṭeveth 5362", + "Teveth 5362", + "Iyyar 1526", + "Iyar 1526", + # day month year + "26 Tammuz 4816", + "7 Heshvan 5425", + "26 Tishrei 5416", + "26 Tishri 5416", + "14 Adar 5403", + "14 Adar I 5403", + "9 Adar II 5404", + "9 Adar Bet 5404", + # two and 1 digit years + "536", + "53", + "3", +] + + +@pytest.mark.parametrize("date_string", testcases) +def test_should_parse(date_string): + assert hebrew_parser.parse(date_string) + + +error_cases = [ + # invalid days + "0 Tammuz 5403", + "31 Tishri 5403", + # month alone + "Tishri", + # month day only + "12 Heshvan", + # invalid month + "Foo 383", + # wrong format + "2024-10-02", + # year month day not supported + "5403 Adar", + "5403 Adar 14", +] + + +@pytest.mark.parametrize("date_string", error_cases) +def test_should_error(date_string): + with pytest.raises(Exception): + hebrew_parser.parse(date_string) diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py new file mode 100644 index 0000000..6e4a5e6 --- /dev/null +++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py @@ -0,0 +1,43 @@ +import pytest +from undate.converters.calendars.hebrew.parser import hebrew_parser +from undate.converters.calendars.hebrew.transformer import ( + HebrewDateTransformer, + HebrewUndate, +) +from undate.undate import Undate, Calendar +from undate.date import DatePrecision + + +def test_hebrew_undate(): + assert HebrewUndate(848).calendar == Calendar.HEBREW + + +testcases = [ + 
# examples from Princeton Geniza Project + # date conversions checked with https://www.muqawwim.com/ + # 26 Tammuz 4816; Tammuz = month 4 + ("26 Tammuz 4816", HebrewUndate(4816, 4, 26), DatePrecision.DAY), + ("Tammuz 4816", HebrewUndate(4816, 4), DatePrecision.MONTH), + ("4816", HebrewUndate(4816), DatePrecision.YEAR), + # 26 Tishrei 5416: Tishrei = month 7 + ("26 Tishrei 5416", HebrewUndate(5416, 7, 26), DatePrecision.DAY), + # Ṭeveth = month 10 + ("Ṭevet 5362", HebrewUndate(5362, 10), DatePrecision.MONTH), + ("5362", HebrewUndate(5362), DatePrecision.YEAR), + # add when we support parsing ranges: + # Adar I and Adar II 5453 : (1693 CE) +] + + +@pytest.mark.parametrize("date_string,expected,expected_precision", testcases) +def test_transform(date_string, expected, expected_precision): + transformer = HebrewDateTransformer(visit_tokens=True) + # parse the input string, then transform to undate object + parsetree = hebrew_parser.parse(date_string) + transformed_date = transformer.transform(parsetree) + assert transformed_date == expected + # currently only undates have date precision + if isinstance(transformed_date, Undate): + assert transformed_date.precision == expected_precision + # transformer doesn't have access to date string, + # label will need to be set by the converter class