Skip to content

Commit

Permalink
[NL DateTimeV2] Dutch datetime support in Python (#3048)
Browse files Browse the repository at this point in the history
  • Loading branch information
samhickey25 authored Feb 6, 2023
1 parent c48d08a commit 4071e4e
Show file tree
Hide file tree
Showing 50 changed files with 8,783 additions and 3,063 deletions.
26 changes: 13 additions & 13 deletions Patterns/Dutch/Dutch-DateTime.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ LangMarker: Dut
# Note: CheckBothBeforeAfter is set to true in DutchSetExtractorConfiguration
CheckBothBeforeAfter: !bool false
TillRegex: !nestedRegex
def: (?<till>\b(tot(dat|\s+en\s+met)?|gedurende|tijdens|ten tijde van)\b|{BaseDateTime.RangeConnectorSymbolRegex})
def: (?<till>\b(tot(dat|\s+en\s+met)?|en|gedurende|tijdens|ten tijde van)\b|{BaseDateTime.RangeConnectorSymbolRegex})
references: [ BaseDateTime.RangeConnectorSymbolRegex ]
RangeConnectorRegex: !nestedRegex
def: (?<and>\b(en|t/m|tot(\s+(aan|en\s+met))?)\b|{BaseDateTime.RangeConnectorSymbolRegex})
Expand Down Expand Up @@ -47,6 +47,10 @@ PastSuffixRegex: !simpleRegex
DayRegex: !simpleRegex
def: (de\s*)?(?<!(\d+:|\$)\s*)(?<day>(?:3[0-1]|[1-2]\d|0?[1-9]))(?:\s*(ste|de|e))?(?=\b|t)
# 1-31 written
WrittenOneToNineRegex: !simpleRegex
def: (één|een|twee|drie|vier|vijf|zes|zeven|acht|negen)
WrittenElevenToNineteenRegex: !simpleRegex
def: (elf|elven|twaalf|dertien|veertien|vijftien|zestien|zeventien|achttien|negentien)
WrittenDayRegex: !nestedRegex
def: (?<day>({WrittenOneToNineRegex})|({WrittenElevenToNineteenRegex})|(({WrittenOneToNineRegex}(en|ën))?twintig)|(((één|een)(en|ën))?dertig))
references: [ WrittenOneToNineRegex, WrittenElevenToNineteenRegex ]
Expand All @@ -57,10 +61,6 @@ ImplicitDayRegex: !simpleRegex
def: (de\s*)?(?<day>(3[0-1]|[0-2]?\d)(\s*(ste|de|e)))\b
MonthNumRegex: !simpleRegex
def: \b(?<month>01|02|03|04|05|06|07|08|09|10|11|12|1|2|3|4|5|6|7|8|9)\b
WrittenOneToNineRegex: !simpleRegex
def: (één|een|twee|drie|vier|vijf|zes|zeven|acht|negen)
WrittenElevenToNineteenRegex: !simpleRegex
def: (elf|elven|twaalf|dertien|veertien|vijftien|zestien|zeventien|achttien|negentien)
WrittenTensRegex: !simpleRegex
def: (tien|twintig|dertig|veertig|vijftig|zestig|zeventig|tachtig|negentig)
WrittenNumRegex: !nestedRegex
Expand Down Expand Up @@ -97,7 +97,7 @@ DescRegex: !nestedRegex
def: (:?(:?({OclockRegex}\s+)?(?<desc>({AmPmDescRegex}|{AmDescRegex}|{PmDescRegex}|{SpecialDescRegex}))\.?)|{OclockRegex})
references: [ OclockRegex, AmDescRegex, PmDescRegex, AmPmDescRegex, SpecialDescRegex ]
PmRegex: !nestedRegex
def: (?<pm>({ApostrofsRegex}|des)\s+(\bmiddags|avonds|nachts)|((in|tegen|op|om|met)\s+(de\s+)?)(((na)?middag|avond|(midder)?nacht|lunchtijd))|dag)
def: (?<pm>({ApostrofsRegex}|des)\s+(\bmiddags|avonds|nachts)|((in|tegen|op|om|met)\s+(de\s+)?)(((na)?middag|avond|(midder)?nacht|lunchtijd))|\s+dag)
references: [ ApostrofsRegex ]
PmRegexFull: !nestedRegex
def: (?<pm>(({ApostrofsRegex}|des)\s+(\bmiddags|avonds|nachts)|((in|tegen|op|om|met)\s+(de\s+)?)?(((na)?middag|(?<!kerst|oude?jaars)avond|(midder)?nacht|lunchtijd))))
Expand Down Expand Up @@ -427,6 +427,8 @@ PeriodHourNumRegex: !simpleRegex
ConnectNumRegex: !nestedRegex
def: '\b{BaseDateTime.HourRegex}(?<min>00|01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31|32|33|34|35|36|37|38|39|40|41|42|43|44|45|46|47|48|49|50|51|52|53|54|55|56|57|58|59)\s*{DescRegex}'
references: [ BaseDateTime.HourRegex, DescRegex ]
AroundRegex: !simpleRegex
def: (\b(rond(om)?|ongeveer(\s+om)?)\s*\b)
TimeRegexWithDotConnector: !nestedRegex
def: ({BaseDateTime.HourRegex}(\s*\.\s*){BaseDateTime.MinuteRegex}(\s*:\s*{BaseDateTime.SecondRegex})?(\s*u\s*)?)
references: [ BaseDateTime.HourRegex, BaseDateTime.MinuteRegex, BaseDateTime.SecondRegex ]
Expand Down Expand Up @@ -501,6 +503,8 @@ SpecificTimeBetweenAnd: !nestedRegex
references: [ TimeRegex2, RangeConnectorRegex, HourDTRegEx, PeriodHourNumRegex, DescRegex, PmRegexFull, AmRegex, TimeSuffix ]
PrepositionRegex: !simpleRegex
def: (?<prep>^(om|rond|tegen|op|van|deze)(\s+de)?$)
MealTimeRegex: !simpleRegex
def: \b((((tijdens\s+)?de|het)\s+)?(?<mealTime>ontbijt|lunch|avondeten)|((om|tegen|tijdens)\s+)?(?<mealTime>lunchtijd))\b
EarlyLateRegex: !simpleRegex
def: \b(((?<early>vroege?|(in\s+het\s+)?(begin))|(?<late>laat|later|late|aan\s+het\s+einde?))((\s+|-)(in\s+de|op\s+de|van\s+de|deze|in|op|van|de))?)
TimeOfDayRegex: !nestedRegex
Expand Down Expand Up @@ -546,6 +550,8 @@ UnspecificEndOfRegex: !simpleRegex
def: \b(((om|rond|tegen|op)\s+)?het\s+)?(einde?\s+van\s+(de\s+)?dag)\b
UnspecificEndOfRangeRegex: !simpleRegex
def: \b(evj)\b
MiddlePauseRegex: !simpleRegex
def: \s*(,)\s*
PeriodTimeOfDayRegex: !nestedRegex
def: ((in\s+(de)?\s+)?({EarlyLateRegex}(\s+|-))?(zondag|maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|(eer)?gisteren|morgen)?(?<timeOfDay>ochtend|(na)?middag|avond|nacht))\b
references: [ EarlyLateRegex ]
Expand All @@ -566,7 +572,7 @@ DurationUnitRegex: !nestedRegex
def: (?<unit>{DateUnitRegex}|(min\.|sec\.)|((?<half>halfuur)|(?<quarter>kwartier\s+uur)|(?<quarter>kwartier)|uur|uren|u|minuten|minuut|m(ins?)?|seconde[ns]?|s(ecs?)?|nacht(en)?)\b)(\s+lang\b)?
references: [ DateUnitRegex ]
SuffixAndRegex: !simpleRegex
def: (?<suffix>\s*(en|ën)(\s*een)?\s*(?<suffix_num>hal(f|ve)|kwart|kwartier)|(?<suffix_num>(een\s+)?kwartier))
def: (?<suffix>\s(\s*en|ën)(\s*een)?\s*(?<suffix_num>hal(f|ve)|kwart|kwartier)|(?<suffix_num>(een\s+)?kwartier))
PeriodicRegex: !simpleRegex
def: \b(?<periodic>dagelijkse?|(drie)?maandelijkse?|wekelijkse?|twee-?wekelijkse?|(half)?jaarlijkse?|kwartaal)\b
EachUnitRegex: !nestedRegex
Expand Down Expand Up @@ -641,8 +647,6 @@ BeforeRegex: !nestedRegex
references: [ InclusiveModPrepositions ]
SinceRegex: !simpleRegex
def: (\b(sinds|na\s+of\s+gelijk\s+aan|(startend|beginnend)\s+(vanaf|op|met)|(al\s+)?zo\s+vroeg\s+als|(elk|ieder)\s+moment\s+vanaf|een\s+tijdstip\s+vanaf)\b\s*)|(?<!\w|<)(>=)
AroundRegex: !simpleRegex
def: (\b(rond(om)?|ongeveer(\s+om)?)\s*\b)
AgoRegex: !simpleRegex
def: \b(geleden|(voor|eerder\s+dan)\s+(?<day>gisteren|vandaag))\b
LaterRegex: !simpleRegex
Expand Down Expand Up @@ -720,8 +724,6 @@ RestOfDateRegex: !simpleRegex
def: \brest\s+(van\s+)?((de|het|mijn|dit|deze|(de\s+)?huidige)\s+)?(?<duration>week|maand|jaar|decennium)\b
RestOfDateTimeRegex: !simpleRegex
def: \brest\s+(van\s+)?((de|het|mijn|dit|deze|(de\s+)?huidige)\s+)?(?<unit>vandaag|dag)\b
MealTimeRegex: !simpleRegex
def: \b((((tijdens\s+)?de|het)\s+)?(?<mealTime>ontbijt|lunch|avondeten)|((om|tegen|tijdens)\s+)?(?<mealTime>lunchtijd))\b
AmbiguousRangeModifierPrefix: !simpleRegex
def: (voor)
PotentialAmbiguousRangeRegex: !nestedRegex
Expand All @@ -740,8 +742,6 @@ WeekWithWeekDayRangeRegex: !nestedRegex
references: [NextPrefixRegex, PreviousPrefixRegex, WeekDayRegex]
GeneralEndingRegex: !simpleRegex
def: ^\s*((\.,)|\.|,|!|\?)?\s*$
MiddlePauseRegex: !simpleRegex
def: \s*(,)\s*
DurationConnectorRegex: !simpleRegex
def: ^\s*(?<connector>\s+|en|,)\s*$
PrefixArticleRegex: !simpleRegex
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@
from .chinese import *
from .french import *
from .portuguese import *
from .dutch import *
Original file line number Diff line number Diff line change
Expand Up @@ -609,7 +609,7 @@ def match_duration(self, source: str, reference: datetime) -> List[Token]:
durations = []
duration_extractions = self.config.duration_extractor.extract(source, reference)

for duration_extraction in self.config.duration_extractor.extract(source, reference):
for duration_extraction in duration_extractions:
match = self.config.date_unit_regex.search(duration_extraction.text)
if match:
durations.append(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
from .german.common_configs import GermanCommonDateTimeParserConfiguration
from .german.merged_extractor_config import GermanMergedExtractorConfiguration
from .german.merged_parser_config import GermanMergedParserConfiguration
from .dutch.common_configs import DutchCommonDateTimeParserConfiguration
from .dutch.merged_extractor_config import DutchMergedExtractorConfiguration
from .dutch.merged_parser_config import DutchMergedParserConfiguration


class DateTimeRecognizer(Recognizer[DateTimeOptions]):
Expand Down Expand Up @@ -86,6 +89,12 @@ def initialize_configuration(self):
BaseMergedExtractor(GermanMergedExtractorConfiguration(), options)
))

self.register_model('DateTimeModel', Culture.Dutch, lambda options: DateTimeModel(
BaseMergedParser(DutchMergedParserConfiguration(
DutchCommonDateTimeParserConfiguration()), options),
BaseMergedExtractor(DutchMergedExtractorConfiguration(), options)
))

def get_datetime_model(self, culture: str = None, fallback_to_default_culture: bool = True) -> Model:
return self.get_model('DateTimeModel', culture, fallback_to_default_culture)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from .base_configs import *
from .common_configs import *
from .duration_extractor_config import *
from .date_extractor_config import *
from .time_extractor_config import *
from .datetime_extractor_config import *
from .dateperiod_extractor_config import *
from .timeperiod_extractor_config import *
from .datetimeperiod_extractor_config import *
from .set_extractor_config import *
from .holiday_extractor_config import *
from .merged_extractor_config import *
from .duration_parser_config import *
from .date_parser_config import *
from .time_parser_config import *
from .datetime_parser_config import *
from .dateperiod_parser_config import *
from .timeperiod_parser_config import *
from .datetimeperiod_parser_config import *
from .set_parser_config import *
from .holiday_parser_config import *
from .merged_parser_config import *
from .parsers import *
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from typing import Pattern
from recognizers_text.utilities import RegExpUtility
from ...resources.dutch_date_time import DutchDateTime
from ..base_date import DateTimeUtilityConfiguration


class DutchDateTimeUtilityConfiguration(DateTimeUtilityConfiguration):
    """Dutch implementation of ``DateTimeUtilityConfiguration``.

    Each pattern is read from the ``DutchDateTime`` resource and compiled once
    in ``__init__`` with ``RegExpUtility.get_safe_reg_exp``. The properties
    below only return the stored values; they perform no work of their own.
    """

    def __init__(self):
        """Compile the Dutch patterns and cache them on the instance."""
        compile_pattern = RegExpUtility.get_safe_reg_exp

        self._date_unit_regex = compile_pattern(
            DutchDateTime.DateUnitRegex)
        # Stored as-is (not compiled): the Dutch YAML resource declares
        # CheckBothBeforeAfter as a ``!bool``, not as a regex.
        self._check_both_before_after = DutchDateTime.CheckBothBeforeAfter
        self._range_prefix_regex = compile_pattern(
            DutchDateTime.RangePrefixRegex)
        self._ago_regex = compile_pattern(
            DutchDateTime.AgoRegex)
        self._later_regex = compile_pattern(
            DutchDateTime.LaterRegex)
        self._in_connector_regex = compile_pattern(
            DutchDateTime.InConnectorRegex)
        self._range_unit_regex = compile_pattern(
            DutchDateTime.RangeUnitRegex)
        self._am_desc_regex = compile_pattern(
            DutchDateTime.AmDescRegex)
        self._pm_desc__regex = compile_pattern(
            DutchDateTime.PmDescRegex)
        self._am_pm_desc_regex = compile_pattern(
            DutchDateTime.AmPmDescRegex)
        self._time_unit_regex = compile_pattern(
            DutchDateTime.TimeUnitRegex)
        self._within_next_prefix_regex = compile_pattern(
            DutchDateTime.WithinNextPrefixRegex)
        self._common_date_prefix_regex = compile_pattern(
            DutchDateTime.CommonDatePrefixRegex)
        self._since_year_suffix_regex = compile_pattern(
            DutchDateTime.SinceYearSuffixRegex)

    @property
    def date_unit_regex(self) -> Pattern:
        """Compiled ``DutchDateTime.DateUnitRegex``."""
        return self._date_unit_regex

    @property
    def check_both_before_after(self) -> bool:
        """Uncompiled ``DutchDateTime.CheckBothBeforeAfter`` flag."""
        return self._check_both_before_after

    @property
    def range_prefix_regex(self) -> Pattern:
        """Compiled ``DutchDateTime.RangePrefixRegex``."""
        return self._range_prefix_regex

    @property
    def ago_regex(self) -> Pattern:
        """Compiled ``DutchDateTime.AgoRegex``."""
        return self._ago_regex

    @property
    def later_regex(self) -> Pattern:
        """Compiled ``DutchDateTime.LaterRegex``."""
        return self._later_regex

    @property
    def in_connector_regex(self) -> Pattern:
        """Compiled ``DutchDateTime.InConnectorRegex``."""
        return self._in_connector_regex

    @property
    def range_unit_regex(self) -> Pattern:
        """Compiled ``DutchDateTime.RangeUnitRegex``."""
        return self._range_unit_regex

    @property
    def am_desc_regex(self) -> Pattern:
        """Compiled ``DutchDateTime.AmDescRegex``."""
        return self._am_desc_regex

    # NOTE(review): the double underscore in the name is kept on purpose;
    # presumably it mirrors the name declared on the base class - confirm
    # before renaming.
    @property
    def pm_desc__regex(self) -> Pattern:
        """Compiled ``DutchDateTime.PmDescRegex``."""
        return self._pm_desc__regex

    @property
    def am_pm_desc_regex(self) -> Pattern:
        """Compiled ``DutchDateTime.AmPmDescRegex``."""
        return self._am_pm_desc_regex

    @property
    def time_unit_regex(self) -> Pattern:
        """Compiled ``DutchDateTime.TimeUnitRegex``."""
        return self._time_unit_regex

    @property
    def within_next_prefix_regex(self) -> Pattern:
        """Compiled ``DutchDateTime.WithinNextPrefixRegex``."""
        return self._within_next_prefix_regex

    @property
    def common_date_prefix_regex(self) -> Pattern:
        """Compiled ``DutchDateTime.CommonDatePrefixRegex``."""
        return self._common_date_prefix_regex

    @property
    def since_year_suffix_regex(self) -> Pattern:
        """Compiled ``DutchDateTime.SinceYearSuffixRegex``."""
        return self._since_year_suffix_regex
Loading

0 comments on commit 4071e4e

Please sign in to comment.