From f3c02913a67db488041c9f4fbd2cdab76d0faa53 Mon Sep 17 00:00:00 2001 From: Mai <42946515+Maihav@users.noreply.github.com> Date: Thu, 22 Aug 2019 23:53:15 -0300 Subject: [PATCH] [Python][Recognizers-Text] NumberWithUnit class Refactor (#1815) * Add fixes to matcher and add sequence * Signature Migrations * Add abstract_matcher signatures * Change type of value to [srt] * Add the matcher class into a new file * Delete extra decorators & insert class * Fix matcher implementation * Add AANode methods implementation * Add get_tokenized_text implementation * Apply PR's feedback * Fix autopep * Add missing methods to AaNode class * Add tests to string_matcher * Add missing Init implementations using multipledispatch * Finishing touches and implementation with passing tests * Add Spanish-Portuguese-French * Json spec fixes * Feedback Applied * Minor fixes of interfaces --- .../date_time/chinese/time_parser.py | 1 - .../number_with_unit/chinese/extractors.py | 26 +- .../number_with_unit/english/extractors.py | 23 + .../number_with_unit/extractors.py | 417 ++++++++++++------ .../number_with_unit/french/extractors.py | 5 + .../number_with_unit/models.py | 21 +- .../number_with_unit/parsers.py | 15 +- .../number_with_unit/portuguese/extractors.py | 4 + .../number_with_unit/spanish/extractors.py | 4 + .../recognizers_number/number/extractors.py | 3 +- .../Matcher/number_with_unit_tokenizer.py | 12 +- .../Matcher/string_matcher.py | 3 +- .../matcher/abstract_matcher.py | 28 ++ .../recognizers_text/matcher/matcher.py | 14 + Python/tests/matcher/test_string_matcher.py | 2 +- Python/tests/runner.py | 1 - .../NumberWithUnit/Chinese/CurrencyModel.json | 8 +- .../NumberWithUnit/English/CurrencyModel.json | 11 +- .../English/DimensionModel.json | 6 +- .../NumberWithUnit/French/CurrencyModel.json | 2 +- .../NumberWithUnit/French/DimensionModel.json | 2 +- .../Portuguese/DimensionModel.json | 2 +- .../Spanish/DimensionModel.json | 2 +- 23 files changed, 431 insertions(+), 181 deletions(-) create mode 100644 Python/libraries/recognizers-text/recognizers_text/matcher/abstract_matcher.py create mode 100644 Python/libraries/recognizers-text/recognizers_text/matcher/matcher.py diff --git a/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/chinese/time_parser.py b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/chinese/time_parser.py index 47552f529d..c9cda5e22c 100644 --- a/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/chinese/time_parser.py +++ b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/chinese/time_parser.py @@ -70,7 +70,6 @@ def handle_less(self, extra: DateTimeExtra) -> TimeResult: return TimeResult(_all / 60, _all % 60, second) def handle_digit(self, extra: DateTimeExtra) -> TimeResult: - print(extra.named_entity) hour = self.match_to_value(next(iter(extra.named_entity['hour']), '')) minute = self.match_to_value(next(iter(extra.named_entity['min']), '')) second = self.match_to_value(next(iter(extra.named_entity['sec']), '')) diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/chinese/extractors.py b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/chinese/extractors.py index ca1f97e52a..6eb80638f5 100644 --- a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/chinese/extractors.py +++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/chinese/extractors.py @@ -41,6 +41,30 @@ def non_unit_regex(self) -> Pattern: def ambiguous_unit_number_multiplier_regex(self) -> Pattern: return None + @property + def ambiguity_filters_dict(self) -> Dict[Pattern, Pattern]: + return None + + @property + def extract_type(self) -> str: + raise NotImplementedError + + @property + def suffix_list(self) -> Dict[str, str]: + raise NotImplementedError + + @property + def prefix_list(self) -> Dict[str, str]: + raise NotImplementedError + + @property + def ambiguous_unit_list(self) -> List[str]: + raise NotImplementedError + + @property + def culture_info(self) -> CultureInfo: + return self._culture_info + def __init__(self, culture_info: CultureInfo): if culture_info is None: culture_info = CultureInfo(Culture.Chinese) @@ -98,7 +122,7 @@ def prefix_list(self) -> Dict[str, str]: def ambiguous_unit_list(self) -> List[str]: return self._ambiguous_unit_list - def __init__(self, culture_info: CultureInfo = None): + def __init__(self, culture_info: CultureInfo = Culture.Chinese): super().__init__(culture_info) self._suffix_list = ChineseNumericWithUnit.CurrencySuffixList self._prefix_list = ChineseNumericWithUnit.CurrencyPrefixList diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/english/extractors.py b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/english/extractors.py index 22bd292cab..8b75b32d17 100644 --- a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/english/extractors.py +++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/english/extractors.py @@ -14,6 +14,10 @@ # pylint: disable=abstract-method class EnglishNumberWithUnitExtractorConfiguration(NumberWithUnitExtractorConfiguration): + @property + def ambiguity_filters_dict(self) -> Dict[Pattern, Pattern]: + return EnglishNumericWithUnit.AmbiguityFiltersDict + @property def unit_num_extractor(self) -> Extractor: return self._unit_num_extractor @@ -58,6 +62,10 @@ def __init__(self, culture_info: CultureInfo): # pylint: enable=abstract-method class EnglishAgeExtractorConfiguration(EnglishNumberWithUnitExtractorConfiguration): + @property + def ambiguity_filters_dict(self) -> Dict[Pattern, Pattern]: + return EnglishNumericWithUnit.AmbiguityFiltersDict + @property def extract_type(self) -> str: return Constants.SYS_UNIT_AGE @@ -82,6 +90,11 @@ def __init__(self, culture_info: CultureInfo = None): class EnglishCurrencyExtractorConfiguration(EnglishNumberWithUnitExtractorConfiguration): + + @property + def ambiguity_filters_dict(self) -> Dict[Pattern, Pattern]: + return EnglishNumericWithUnit.AmbiguityFiltersDict + @property def extract_type(self) -> str: return Constants.SYS_UNIT_CURRENCY @@ -106,6 +119,11 @@ def __init__(self, culture_info: CultureInfo = None): class EnglishDimensionExtractorConfiguration(EnglishNumberWithUnitExtractorConfiguration): + + @property + def ambiguity_filters_dict(self) -> Dict[Pattern, Pattern]: + return EnglishNumericWithUnit.AmbiguityFiltersDict + @property def extract_type(self) -> str: return Constants.SYS_UNIT_DIMENSION @@ -137,6 +155,11 @@ def __init__(self, culture_info: CultureInfo = None): class EnglishTemperatureExtractorConfiguration(EnglishNumberWithUnitExtractorConfiguration): + + @property + def ambiguity_filters_dict(self) -> Dict[Pattern, Pattern]: + return EnglishNumericWithUnit.AmbiguityFiltersDict + @property def extract_type(self) -> str: return Constants.SYS_UNIT_TEMPERATURE diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/extractors.py b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/extractors.py index d0fa2f9466..3d868592c7 100644 --- a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/extractors.py +++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/extractors.py @@ -4,16 +4,24 @@ from collections import namedtuple from itertools import chain import regex - from .constants import * from recognizers_text.utilities import RegExpUtility from recognizers_text.extractor import Extractor, ExtractResult from recognizers_number.culture import CultureInfo +from recognizers_text.Matcher.string_matcher import StringMatcher +from recognizers_text.Matcher.match_strategy import MatchStrategy +from recognizers_text.Matcher.number_with_unit_tokenizer import NumberWithUnitTokenizer +from recognizers_text.Matcher.match_result import MatchResult + PrefixUnitResult = namedtuple('PrefixUnitResult', ['offset', 'unit']) class NumberWithUnitExtractorConfiguration(ABC): + @property + def ambiguity_filters_dict(self) -> Dict[Pattern, Pattern]: + pass + @property @abstractmethod def extract_type(self) -> str: @@ -78,142 +86,203 @@ def __init__(self, culture_info: CultureInfo): class NumberWithUnitExtractor(Extractor): + + @property + def separator(self): + return ['|'] + + @property + def max_prefix_match_len(self): + return self.__max_prefix_match_len + + @max_prefix_match_len.setter + def max_prefix_match_len(self, value): + self.__max_prefix_match_len = value + + @property + def prefix_matcher(self): + return self.__prefix_matcher + + @prefix_matcher.setter + def prefix_matcher(self, value): + self.__prefix_matcher = value + + @property + def suffix_matcher(self): + return self.__suffix_matcher + + @suffix_matcher.setter + def suffix_matcher(self, value): + self.__suffix_matcher = value + + @property + def separate_regex(self): + return self.__separate_regex + + @separate_regex.setter + def separate_regex(self, value): + self.__separate_regex = value + def __init__(self, config: NumberWithUnitExtractorConfiguration): - self.config: NumberWithUnitExtractorConfiguration = config + self.config = config + self.max_prefix_match_len = 0 if self.config.suffix_list: - self.suffix_regex: Set[Pattern] = self._build_regex_from_set( - self.config.suffix_list.values()) + self.__suffix_matcher = self._build_matcher_from_set( + list(self.config.suffix_list.values())) else: - self.suffix_regex: Set[Pattern] = set() + self.__suffix_matcher = StringMatcher() if self.config.prefix_list: - max_length = max( - map(len, ('|'.join(self.config.prefix_list.values()).split('|')))) - - self.max_prefix_match_len = max_length + 2 - self.prefix_regex: Set[Pattern] = self._build_regex_from_set( - self.config.prefix_list.values()) + for pre_match in self.config.prefix_list.values(): + match_list = str(pre_match).split(self.separator[0]) + for match in match_list: + if self.max_prefix_match_len >= len(match): + self.max_prefix_match_len = self.max_prefix_match_len + else: + self.max_prefix_match_len = len(match) + + # 2 is the maximum length of spaces. + self.max_prefix_match_len += 2 + self.__prefix_matcher = self._build_matcher_from_set(self.config.prefix_list.values()) else: - self.max_prefix_match_len = 0 - self.prefix_regex: Set[Pattern] = set() + self.__prefix_matcher = StringMatcher() + self.separate_regex = self._build_separate_regex_from_config() def extract(self, source: str) -> List[ExtractResult]: + if not self._pre_check_str(source): - return list() + return [] + + non_unit_match = None mapping_prefix: Dict[float, PrefixUnitResult] = dict() - matched: List[bool] = [False] * len(source) - numbers: List[ExtractResult] = self.config.unit_num_extractor.extract( - source) - result: List[ExtractResult] = list() - source_len = len(source) - - # Special case for cases where number multipliers clash with unit - ambiguous_multiplier_regex = self.config.ambiguous_unit_number_multiplier_regex - if ambiguous_multiplier_regex is not None: - - for num in numbers: - match = list(filter(lambda x: x.group(), regex.finditer( - ambiguous_multiplier_regex, num.text))) - if match and len(match) == 1: - new_length = num.length - \ - (match[0].span()[1] - match[0].span()[0]) - num.text = num.text[0:new_length] - num.length = new_length - - # Mix prefix and numbers, make up a prefix-number combination - if self.max_prefix_match_len != 0: - for num in numbers: - if num.start is None or num.length is None: - continue - max_find_prefix = min(self.max_prefix_match_len, num.start) - if max_find_prefix == 0: - continue + matched = [False] * len(source) + result = [] + prefix_matched = False + prefix_match: List[MatchResult] = sorted(self.prefix_matcher.find(source), key=lambda o: o.start) + suffix_match: List[MatchResult] = sorted(self.suffix_matcher.find(source), key=lambda o: o.start) - left: str = source[num.start - max_find_prefix:num.start] - last_index = len(left) - best_match: Match = None - for pattern in self.prefix_regex: - collection = list(filter(lambda x: len( - x.group()), regex.finditer(pattern, left))) - for match in collection: - if left[match.start():last_index].strip() == match.group(): - if best_match is None or best_match.start() >= match.start(): - best_match = match - if best_match: - mapping_prefix[num.start] = PrefixUnitResult( - offset=last_index - best_match.start(), - unit=left[best_match.start():last_index] - ) - for num in numbers: - if num.start is None or num.length is None: - continue - start = num.start - length = num.length - max_find_len = source_len - start - length - - prefix_unit: PrefixUnitResult = mapping_prefix.get(start, None) - - if max_find_len > 0: - right = source[start + length:start + length + max_find_len] - unit_match_list = map(lambda x: list( - regex.finditer(x, right)), self.suffix_regex) - unit_match = chain.from_iterable(unit_match_list) - unit_match = list(filter(lambda x: x.group(), unit_match)) - - max_len = 0 - for match in unit_match: - if match.group(): - end_pos = match.start() + len(match.group()) - if match.start() >= 0: - middle: str = right[:min( - match.start(), len(right))] - if max_len < end_pos and (not middle.strip() or middle.strip() == self.config.connector_token): - max_len = end_pos - if max_len != 0: - for i in range(length + max_len): - matched[i + start] = True - ex_result = ExtractResult() - ex_result.start = start - ex_result.length = length + max_len - ex_result.text = source[start:start + length + max_len] - ex_result.type = self.config.extract_type - - if prefix_unit: - ex_result.start -= prefix_unit.offset - ex_result.length += prefix_unit.offset - ex_result.text = prefix_unit.unit + ex_result.text - - num.start = start - ex_result.start - ex_result.data = num - - is_not_unit = False - if ex_result.type == Constants.SYS_UNIT_DIMENSION: - non_unit_match = self.config.non_unit_regex.finditer( - source) - for match in non_unit_match: - if ex_result.start >= match.start() and ex_result.end <= match.end(): - is_not_unit = True - - if is_not_unit: - continue - - result.append(ex_result) - continue - if prefix_unit: - ex_result = ExtractResult() - ex_result.start = num.start - prefix_unit.offset - ex_result.length = num.length + prefix_unit.offset - ex_result.text = prefix_unit.unit + num.text - ex_result.type = self.config.extract_type + if len(prefix_match) > 0 or len(suffix_match) > 0: + + numbers: List[ExtractResult] = sorted(self.config.unit_num_extractor.extract(source), key=lambda o: o.start) - num.start = start - ex_result.start - ex_result.data = num - result.append(ex_result) + # Special case for cases where number multipliers clash with unit + ambiguous_multiplier_regex = self.config.ambiguous_unit_number_multiplier_regex + if ambiguous_multiplier_regex is not None: + for num in numbers: + match = list(filter(lambda x: x.group(), regex.finditer( + ambiguous_multiplier_regex, num.text))) + if match and len(match) == 1: + new_length = num.length - \ + (match[0].span()[1] - match[0].span()[0]) + num.text = num.text[0:new_length] + num.length = new_length + + for number in numbers: + if number.start is None or number.length is None: + continue + start = int(number.start) + length = int(number.length) + max_find_pref = min(self.max_prefix_match_len, number.start) + max_find_suff = len(source) - start - length + + if max_find_pref != 0: + last_index = start + best_match = None + + for m in prefix_match: + if m.length > 0 and m.end > start: + break + + if m.length > 0 and source[m.start: m.start + (last_index - m.start)].strip() == m.text: + best_match = m + break + + if best_match is not None: + off_set = last_index - best_match.start + unit_str = source[best_match.start:best_match.start + off_set] + self.add_element(mapping_prefix, number.start, (PrefixUnitResult(off_set, unit_str))) + prefix_unit = mapping_prefix.get(start, None) + if max_find_suff > 0: + + max_len = 0 + first_index = start + length + + for m in suffix_match: + + if m.length > 0 and m.start >= first_index: + + end_pos = m.start + m.length - first_index + if max_len < end_pos: + mid_str = source[first_index: first_index + (m.start - first_index)] + if mid_str is None or not mid_str or str.isspace(mid_str) \ + or mid_str.strip() == self.config.connector_token: + max_len = end_pos + + if max_len != 0: + substr = source[start: start + length + max_len] + er = ExtractResult() + + er.start = start + er.length = length + max_len + er.text = substr + er.type = self.config.extract_type + + if prefix_unit is not None: + prefix_matched = True + er.start -= prefix_unit[0].offset + er.length += prefix_unit[0].offset + er.text = prefix_unit[0].unit + er.text + + # Relative position will be used in Parser + number.start = start - er.start + er.data = number + + # Special treatment, handle cases like '2:00 pm', '00 pm' is not dimension + is_not_unit = False + + if er.type is Constants.SYS_UNIT_DIMENSION: + if non_unit_match is None: + non_unit_match = list(self.config.non_unit_regex.finditer(source)) + for time in non_unit_match: + trimmed_source = source.lower() + index = trimmed_source.index(time.group()) + if er.start >= time.start() and er.start + er.length <= \ + time.start() + len(time.group()): + is_not_unit = True + break + + if is_not_unit: + continue + + result.append(er) + + if prefix_unit and prefix_unit is not None and not prefix_matched: + er = ExtractResult() + er.start = number.start - prefix_unit[0].offset + er.length = number.length + prefix_unit[0].offset + er.text = prefix_unit[0].unit + number.text + er.type = self.config.extract_type + + # Relative position will be used in Parser + number.start = start - er.start + er.data = number + result.append(er) + + # Extract Separate unit if self.separate_regex: - result = self._extract_separate_units(source, result) + if non_unit_match is None: + try: + non_unit_match = list(self.config.non_unit_regex.match(source)) + except: + non_unit_match = [] + + self._extract_separate_units(source, result, non_unit_match) + + # Remove common ambiguous cases + result = self._filter_ambiguity(result, source) return result @@ -223,12 +292,16 @@ def validate_unit(self, source: str) -> bool: def _pre_check_str(self, source: str) -> bool: return len(source) != 0 - def _extract_separate_units(self, source: str, num_depend_source: List[ExtractResult]) -> List[ExtractResult]: + def _extract_separate_units(self, source: str, num_depend_source: List[ExtractResult], non_unit_matches) -> List[ExtractResult]: result = deepcopy(num_depend_source) match_result: List[bool] = [False] * len(source) for ex_result in num_depend_source: - for i in range(ex_result.start, ex_result.end + 1): - match_result[i] = True + start = ex_result.start + i = 0 + while i < ex_result.length: + match_result[start + i] = True + i += 1 + match_collection = list( filter(lambda x: x.group(), regex.finditer(self.separate_regex, source))) for match in match_collection: @@ -241,9 +314,7 @@ def _extract_separate_units(self, source: str, num_depend_source: List[ExtractRe is_not_unit = False if match.group() == Constants.AMBIGUOUS_TIME_TERM: - non_unit_match = self.config.non_unit_regex.finditer( - source) - for time in non_unit_match: + for time in non_unit_matches: if self._dimension_inside_time(match, time): is_not_unit = True @@ -255,12 +326,28 @@ def _extract_separate_units(self, source: str, num_depend_source: List[ExtractRe to_add.length = len(match.group()) to_add.text = match.group() to_add.type = self.config.extract_type - result.append(to_add) - return result + num_depend_source.append(to_add) def _build_regex_from_set(self, definitions: List[str], ignore_case: bool = False) -> Set[Pattern]: return set(map(lambda x: self.__build_regex_from_str(x, ignore_case), definitions)) + def _build_matcher_from_set(self, definitions) -> StringMatcher: + + matcher = StringMatcher(match_strategy=MatchStrategy.TrieTree, tokenizer=NumberWithUnitTokenizer()) + + match_term_list = list(map(lambda words: + list(filter(lambda word: not str.isspace(word) and word is not None, + str(words).strip().split('|'))), + definitions)) + + match_terms = self.distinct(match_term_list) + + flatten = [item for sublist in match_terms for item in sublist] + + matcher.init(flatten) + + return matcher + def __build_regex_from_str(self, source: str, ignore_case: bool) -> Pattern: tokens = map(regex.escape, source.split('|')) definition = '|'.join(tokens) @@ -315,6 +402,50 @@ def _dimension_inside_time(self, dimension: Match, time: Match) -> bool: return is_sub_match + @staticmethod + def distinct(list1): + + # intialize a null list + unique_list = [] + + # traverse for all elements + for x in list1: + # check if exists in unique_list or not + if x not in unique_list: + unique_list.append(x) + # print list + return unique_list + + @staticmethod + def add_element(dict, key, value): + if key not in dict: + dict[key] = [] + dict[key].append(value) + + def _filter_ambiguity(self, ers: List[ExtractResult], text: str,) -> List[ExtractResult]: + + if self.config.ambiguity_filters_dict is not None: + for regex_var in self.config.ambiguity_filters_dict: + regexvar_value = self.config.ambiguity_filters_dict[regex_var] + + try: + reg_match = list(filter(lambda x: x.group(), regex.finditer(regexvar_value, text))) + + if len(reg_match) > 0: + + matches = reg_match + new_ers = list(filter(lambda x: list(filter(lambda m: m.start() < x.start + x.length and m.start() + + len(m.group()) > x.start, matches)), ers)) + if len(new_ers) > 0: + for item in ers: + for i in new_ers: + if item is i: + ers.remove(item) + except Exception: + pass + + return ers + class BaseMergedUnitExtractor(Extractor): def __init__(self, config: NumberWithUnitExtractorConfiguration): @@ -341,24 +472,28 @@ def __merged_compound_units(self, source: str): idx = idx + 1 continue - if isinstance(ers[idx].data, ExtractResult): + if isinstance(ers[idx].data, ExtractResult) and not str(ers[idx].data.data).startswith("Integer"): groups[idx + 1] = groups[idx] + 1 idx = idx + 1 continue middle_begin = ers[idx].start + ers[idx].length - middle_end = ers[idx].start + middle_end = ers[idx + 1].start - middle_str = source[middle_begin:middle_end - - middle_begin].strip().lower() + middle_str = source[middle_begin: middle_begin + (middle_end - + middle_begin)].strip().lower() # Separated by whitespace if not middle_str: groups[idx + 1] = groups[idx] + idx = idx + 1 + continue # Separated by connector match = self.config.compound_unit_connector_regex.match(middle_str) - if match: + if match is not None: + splitted_match = match.string.split(" ") + if match and match.pos == 0 and len(splitted_match[0]) == len(middle_str): groups[idx + 1] = groups[idx] else: groups[idx + 1] = groups[idx] + 1 @@ -387,7 +522,7 @@ def __merged_compound_units(self, source: str): period_end = ers[idx + 1].start + ers[idx + 1].length result[group].length = period_end - period_begin - result[group].text = source[period_begin:period_end - period_begin] + result[group].text = source[period_begin:period_begin + (period_end - period_begin)] result[group].type = Constants.SYS_UNIT_CURRENCY if isinstance(result[group].data, list): result[group].data.append(ers[idx + 1]) @@ -423,8 +558,8 @@ def __merge_pure_number(self, source: str, ers: List[ExtractResult]) -> List[Ext middle_begin = ers[j - 1].start + ers[j - 1].length middle_end = num_ers[i].start - middle_str = source[middle_begin:middle_end - - middle_begin].strip().lower() + middle_str = source[middle_begin: middle_begin + (middle_end - + middle_begin)].strip().lower() # separated by whitespace if not middle_str: @@ -432,6 +567,14 @@ def __merge_pure_number(self, source: str, ers: List[ExtractResult]) -> List[Ext i = i + 1 continue + match = self.config.compound_unit_connector_regex.match(middle_str) + if match is not None: + splitted_match = match.string.split(" ") + if match and match.pos == 0 and len(splitted_match[0]) == len(middle_str): + unit_numbers.append(num_ers[i]) + i = i + 1 + continue + i = i + 1 for extract_result in unit_numbers: diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/french/extractors.py b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/french/extractors.py index 6981b9cf9d..808168cf01 100644 --- a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/french/extractors.py +++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/french/extractors.py @@ -14,6 +14,11 @@ # pylint: disable=abstract-method class FrenchNumberWithUnitExtractorConfiguration(NumberWithUnitExtractorConfiguration): + + @property + def ambiguity_filters_dict(self) -> Dict[Pattern, Pattern]: + return FrenchNumericWithUnit.AmbiguityFiltersDict + @property def unit_num_extractor(self) -> Extractor: return self._unit_num_extractor diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/models.py b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/models.py index 5cd313d163..9799ddf08c 100644 --- a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/models.py +++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/models.py @@ -1,6 +1,5 @@ from abc import abstractmethod from typing import List -from collections import namedtuple from recognizers_text.model import Model, ModelResult from recognizers_text.extractor import Extractor @@ -26,14 +25,12 @@ def __init__(self, extractor_parser: List[ExtractorParserModel]): def parse(self, query: str) -> List[ModelResult]: query = QueryProcessor.preprocess(query, True) - extraction_results = [] try: for item in self.extractor_parser: extract_results = item.extractor.extract(query) - parse_results = [r for r in [item.parser.parse( - r) for r in extract_results] if not r.value is None] + parse_results = [r for r in [item.parser.parse(r) for r in extract_results] if not r.value is None] for parse_result in parse_results: model_result = ModelResult() @@ -54,7 +51,8 @@ def parse(self, query: str) -> List[ModelResult]: return extraction_results - def get_resolution(self, data): + @staticmethod + def get_resolution(data): if isinstance(data, str): return { 'value': data @@ -70,6 +68,19 @@ def get_resolution(self, data): 'unit': data.unit, 'isoCurrency': data.iso_currency } + elif isinstance(data, list): + if hasattr(data[0].value, 'iso_currency'): + return { + 'value': data[0].value.number, + 'unit': data[0].value.unit, + 'isoCurrency': data[0].value.iso_currency + } + else: + return { + 'value': data[0].value.number, + 'unit': data[0].value.unit + } + return None diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/parsers.py b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/parsers.py index 2a21d89e15..b354cd40fc 100644 --- a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/parsers.py +++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/parsers.py @@ -51,6 +51,9 @@ def parse(self, source: ExtractResult) -> Optional[ParseResult]: number_result = None if source.data and isinstance(source.data, ExtractResult): number_result = source.data + elif source.type == Constants.SYS_NUM: + ret.value = self.config.internal_number_parser.parse(source).value + return ret else: # if there is no unitResult, means there is just unit number_result = ExtractResult() number_result.start = -1 @@ -145,12 +148,15 @@ def __merge_compound_unit(self, compound_result: ExtractResult) -> ParseResult: fraction_unit_string = '' idx = 0 + while idx < len(compound_unit): extract_result = compound_unit[idx] parse_result = self.number_with_unit_parser.parse(extract_result) parse_result_value = parse_result.value - unit_value = parse_result_value.unit if parse_result_value else None - + try: + unit_value = parse_result_value.unit if parse_result_value else None + except AttributeError: + unit_value = None # Process a new group if count == 0: if not extract_result.type == Constants.SYS_UNIT_CURRENCY: @@ -186,7 +192,7 @@ def __merge_compound_unit(self, compound_result: ExtractResult) -> ParseResult: if extract_result.type == Constants.SYS_NUM: number_value = number_value + \ float(parse_result.value) * (1 / 100) - result.resolution_str = result.resolution_str + ' ' + parse_result.resolution_str + result.resolution_str = result.resolution_str + ' ' + str(parse_result.resolution_str or '') result.length = parse_result.start + parse_result.length - result.start count = count + 1 idx = idx + 1 @@ -202,7 +208,7 @@ def __merge_compound_unit(self, compound_result: ExtractResult) -> ParseResult: number_value = number_value + ( float(parse_result_value.number) * (1 / fraction_num_value) if parse_result_value else 0) result.resolution_str = result.resolution_str + ' ' + parse_result.resolution_str - result.length = parse_result.start + parse_result.length - result.length + result.length = parse_result.start + parse_result.length - result.start else: if result: if not main_unit_iso_code or main_unit_iso_code.startswith(Constants.FAKE_ISO_CODE_PREFIX): @@ -234,6 +240,7 @@ def __merge_compound_unit(self, compound_result: ExtractResult) -> ParseResult: compound_result.start) ret = ParseResult(compound_result) + ret.value = results return ret diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/portuguese/extractors.py b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/portuguese/extractors.py index 0a1d8044cd..41f2b39c88 100644 --- a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/portuguese/extractors.py +++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/portuguese/extractors.py @@ -14,6 +14,10 @@ # pylint: disable=abstract-method class PortugueseNumberWithUnitExtractorConfiguration(NumberWithUnitExtractorConfiguration): + @property + def ambiguity_filters_dict(self) -> Dict[Pattern, Pattern]: + return None + @property def unit_num_extractor(self) -> Extractor: return self._unit_num_extractor diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/spanish/extractors.py b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/spanish/extractors.py index a61b8ab80f..f0c6b48f4a 100644 --- a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/spanish/extractors.py +++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/spanish/extractors.py @@ -14,6 +14,10 @@ # pylint: disable=abstract-method class SpanishNumberWithUnitExtractorConfiguration(NumberWithUnitExtractorConfiguration): + @property + def ambiguity_filters_dict(self) -> Dict[Pattern, Pattern]: + return None + @property def unit_num_extractor(self) -> Extractor: return self._unit_num_extractor diff --git a/Python/libraries/recognizers-number/recognizers_number/number/extractors.py b/Python/libraries/recognizers-number/recognizers_number/number/extractors.py index e449701d80..3983054e3e 100644 --- a/Python/libraries/recognizers-number/recognizers_number/number/extractors.py +++ b/Python/libraries/recognizers-number/recognizers_number/number/extractors.py @@ -37,14 +37,13 @@ def extract(self, source: str) -> List[ExtractResult]: if source is None or len(source.strip()) is 0: return list() result: List[ExtractResult] = list() - match_source: Dict[[Match], str] = dict() + match_source = dict() matched: List[bool] = [False] * len(source) matches_list = list(map( lambda x: MatchesVal(matches=list(regex.finditer(x.re, source)), val=x.val), self.regexes)) matches_list = list(filter(lambda x: len(x.matches) > 0, matches_list)) - for ml in matches_list: for m in ml.matches: for j in range(len(m.group())): diff --git a/Python/libraries/recognizers-text/recognizers_text/Matcher/number_with_unit_tokenizer.py b/Python/libraries/recognizers-text/recognizers_text/Matcher/number_with_unit_tokenizer.py index baa5ed340d..3da910a6ab 100644 --- a/Python/libraries/recognizers-text/recognizers_text/Matcher/number_with_unit_tokenizer.py +++ b/Python/libraries/recognizers-text/recognizers_text/Matcher/number_with_unit_tokenizer.py @@ -25,18 +25,18 @@ def tokenize(self, input: str) -> []: c = chars[i] if str.isspace(c): if in_token: - tokens.append(Token(token_start, i - token_start, input[token_start: i - token_start])) + tokens.append(Token(token_start, i - token_start, input[token_start: token_start + (i - token_start)])) in_token = False - elif not (c in self.special_tokens_characters) and (str.isdigit(c) or str.isalpha(c)) or \ + elif not (c in self.special_tokens_characters) and not (str.isdigit(c) or str.isalpha(c)) or \ self.is_chinese(c) or self.is_japanese(c): # Non-splittable currency units (as "$") are treated as regular letters. For instance, 'us$' should be # a single token if in_token: - tokens.append(Token(token_start, i - token_start, input[token_start: i - token_start])) + tokens.append(Token(token_start, i - token_start, input[token_start: token_start + (i - token_start)])) in_token = False - tokens.append(Token(i, 1, input[i:1])) + tokens.append(Token(i, 1, input[i: token_start + (i - token_start) + 1])) else: if in_token and i > 0: @@ -44,14 +44,14 @@ def tokenize(self, input: str) -> []: if self.is_splittable_unit(c, pre_char): # Split if letters or non-splittable units are adjacent with digits. - tokens.append(Token(token_start, i - token_start, input[token_start: i - token_start])) + tokens.append(Token(token_start, i - token_start, input[token_start: token_start + (i - token_start)])) token_start = i if not in_token: token_start = i in_token = True if in_token: - tokens.append(Token(token_start, len(chars) - token_start, input[token_start: len(chars) - token_start])) + tokens.append(Token(token_start, len(chars) - token_start, input[token_start: token_start + (len(chars) - token_start)])) return tokens diff --git a/Python/libraries/recognizers-text/recognizers_text/Matcher/string_matcher.py b/Python/libraries/recognizers-text/recognizers_text/Matcher/string_matcher.py index 7e29857541..c216dc4058 100644 --- a/Python/libraries/recognizers-text/recognizers_text/Matcher/string_matcher.py +++ b/Python/libraries/recognizers-text/recognizers_text/Matcher/string_matcher.py @@ -71,7 +71,6 @@ def find(self, query_text: str = "") -> []: query_tokens = self.__tokenizer.tokenize(query_text) tokenized_query_text = list(map(lambda t: t.text, query_tokens)) result = [] - for r in self.find(tokenized_query_text): start_token = query_tokens[r.start] end_token = query_tokens[r.start + r.length - 1] @@ -88,4 +87,4 @@ def find(self, query_text: str = "") -> []: return result def get_tokenized_text(self, values: []) -> []: - return list(map(lambda t: map(lambda i: i.text, self.tokenizer.tokenize(t)), values)) + return list(map(lambda t: list(map(lambda i: i.text, self.tokenizer.tokenize(t))), values)) diff --git a/Python/libraries/recognizers-text/recognizers_text/matcher/abstract_matcher.py b/Python/libraries/recognizers-text/recognizers_text/matcher/abstract_matcher.py new file mode 100644 index 0000000000..2c5e313d72 --- /dev/null +++ b/Python/libraries/recognizers-text/recognizers_text/matcher/abstract_matcher.py @@ -0,0 +1,28 @@ +from .matcher import Matcher +from abc import abstractmethod +from .node import Node + + +class AbstractMatcher(Matcher): + @abstractmethod + def init(self, values: [str], ids: [str]): + raise NotImplementedError + + @abstractmethod + def find(self, query_text: [str]) -> [str]: + raise NotImplementedError + + @abstractmethod + def insert(self, value: [str], id: str): + raise NotImplementedError + + def is_match(self, query_text: [str]): + result = next((e for e in self.find(query_text) if e is None), None) + return result + + def batch_insert(self, values: [], ids: []): + if len(values) != len(ids): + raise Exception('Lengths of Values and Ids are different.') + + for i in range(0, len(values)): + self.insert(values[i], ids[i]) diff --git a/Python/libraries/recognizers-text/recognizers_text/matcher/matcher.py b/Python/libraries/recognizers-text/recognizers_text/matcher/matcher.py new file mode 100644 index 0000000000..45a17f4500 --- /dev/null +++ b/Python/libraries/recognizers-text/recognizers_text/matcher/matcher.py @@ -0,0 +1,14 @@ +from abc import ABC, abstractmethod +from typing import List +from recognizers_text.model import ModelResult + + +class Matcher(ABC): + + @abstractmethod + def init(self, values: [], ids: []) -> None: + raise NotImplementedError + + @abstractmethod + def find(self, query_text: []) -> []: + raise NotImplementedError diff --git a/Python/tests/matcher/test_string_matcher.py b/Python/tests/matcher/test_string_matcher.py index 5bfedff6c6..09e5890f89 100644 --- a/Python/tests/matcher/test_string_matcher.py +++ b/Python/tests/matcher/test_string_matcher.py @@ -25,7 +25,7 @@ def test_simple_with_ids_string_matcher(): match = string_matcher.find(value) if match is not None: assert value == match[0].text - assert ids[i] == match[0].canonical_values.pop() + assert ids[i] == match[0].canonical_values[0] @staticmethod def test_string_matcher(): diff --git a/Python/tests/runner.py b/Python/tests/runner.py index eb3f635582..f8e6e96f1b 100644 --- a/Python/tests/runner.py +++ b/Python/tests/runner.py @@ -33,7 +33,6 @@ def get_suite_config(json_path): def get_suite(json_path): - print(json_path) return {'specs': json.load( open(json_path, encoding='utf-8-sig')), 'config': get_suite_config(json_path)} diff --git a/Specs/NumberWithUnit/Chinese/CurrencyModel.json b/Specs/NumberWithUnit/Chinese/CurrencyModel.json index 959b9d92c5..0264837796 100644 --- a/Specs/NumberWithUnit/Chinese/CurrencyModel.json +++ b/Specs/NumberWithUnit/Chinese/CurrencyModel.json @@ -161,7 +161,7 @@ }, { "Input": "10日元5日本銭", - "NotSupported": "javascript, python, java", + "NotSupported": "javascript, java", "Results": [ { "Text": "10日元5日本銭", @@ -258,7 +258,7 @@ }, { "Input": "这台电脑两美元又三美分", - "NotSupported": "javascript, python", + "NotSupported": "javascript", "Results": [ { "Text": "两美元又三美分", @@ -275,7 +275,7 @@ }, { "Input": "这个手机壳五元三毛", - "NotSupported": "javascript, python", + "NotSupported": "javascript", "Results": [ { "Text": "五元三毛", @@ -292,7 +292,7 @@ }, { "Input": "这个手机壳花费你五美元和花费我三块", - "NotSupported": "javascript, python", + "NotSupported": "javascript", "Results": [ { "Text": "五美元", diff --git a/Specs/NumberWithUnit/English/CurrencyModel.json b/Specs/NumberWithUnit/English/CurrencyModel.json index 47ec20879f..641f66bc1f 100644 --- a/Specs/NumberWithUnit/English/CurrencyModel.json +++ b/Specs/NumberWithUnit/English/CurrencyModel.json @@ -1436,7 +1436,6 @@ }, { "Input": "dollar : 143.80 yen , up 0 . 95 ; 1 . 8500 marks , up 0 . 0085 .", - "NotSupported": "python", "Results": [ { "Text": "dollar", @@ -1463,7 +1462,6 @@ }, { "Input": "it only cost 3 dollars 50 cents.", - "NotSupported": "python", "Results": [ { "Text": "3 dollars 50 cents", @@ -1479,7 +1477,6 @@ }, { "Input": "it only cost thirteen dollars and forty-five cents", - "NotSupported": "python", "Results": [ { "Text": "thirteen dollars and forty-five cents", @@ -1495,7 +1492,6 @@ }, { "Input": "it only cost thirteen dollars forty-five cents", - "NotSupported": "python", "Results": [ { "Text": "thirteen dollars forty-five cents", @@ -1511,7 +1507,6 @@ }, { "Input": "it only cost thirteen dollars forty five", - "NotSupported": "python", "Results": [ { "Text": "thirteen dollars forty five", @@ -1527,7 +1522,6 @@ }, { "Input": "It costs one dollar and one and one point of your credit points.", - "NotSupported": "python", "Results": [ { "Text": "one dollar and one", @@ -1571,7 +1565,6 @@ }, { "Input": "It costs you 10 us dollar and me c $ 100 and fifty.", - "NotSupported": "python", "Results": [ { "Text": "10 us dollar", @@ -1599,7 +1592,7 @@ }, { "Input": "It may need one kuai and five mao five.", - "NotSupported": "javascript, python", + "NotSupported": "javascript", "Results": [ { "Text": "one kuai and five mao five", @@ -1616,7 +1609,6 @@ }, { "Input": "It costs one dollar and two and three points of your credit points.", - "NotSupported": "python", "Results": [ { "Text": "one dollar and two", @@ -1632,7 +1624,6 @@ }, { "Input": "Hey, the cost is $4.25 and 32 is the quantity!", - "NotSupported": "python", "Results": [ { "Text": "$4.25", diff --git a/Specs/NumberWithUnit/English/DimensionModel.json b/Specs/NumberWithUnit/English/DimensionModel.json index dd238a80ba..c8ecea0f3f 100644 --- a/Specs/NumberWithUnit/English/DimensionModel.json +++ b/Specs/NumberWithUnit/English/DimensionModel.json @@ -766,7 +766,7 @@ }, { "Input": "I'll give you a surprise at 2:00 pm", - "NotSupported": "python, java", + "NotSupported": "java", "Results": [] }, { @@ -796,7 +796,7 @@ }, { "Input": "that one mile can provide.", - "NotSupported": "javascript, python, java", + "NotSupported": "javascript, java", "Results": [ { "Text": "one mile", @@ -812,7 +812,7 @@ }, { "Input": "I ' m tired", - "NotSupported": "javascript, python, java", + "NotSupported": "javascript, java", "Results": [] }, { diff --git a/Specs/NumberWithUnit/French/CurrencyModel.json b/Specs/NumberWithUnit/French/CurrencyModel.json index 1504963f33..f08504350c 100644 --- a/Specs/NumberWithUnit/French/CurrencyModel.json +++ b/Specs/NumberWithUnit/French/CurrencyModel.json @@ -1531,7 +1531,7 @@ }, { "Input": "pour cent", - "NotSupported": "javascript,python,java", + "NotSupported": "javascript,java", "Results": [] }, { diff --git a/Specs/NumberWithUnit/French/DimensionModel.json b/Specs/NumberWithUnit/French/DimensionModel.json index 830c18bb32..7c28e4523e 100644 --- a/Specs/NumberWithUnit/French/DimensionModel.json +++ b/Specs/NumberWithUnit/French/DimensionModel.json @@ -601,7 +601,7 @@ }, { "Input": "En 1995 le canon a présenté la première lentille slr disponible dans le commerce avec la stabilisation d'image interne, ef 75 - 300mm f / 4 - 5. 6 est usm.", - "NotSupported": "javascript, python, java", + "NotSupported": "javascript, java", "Results": [ { "Text": "300mm", diff --git a/Specs/NumberWithUnit/Portuguese/DimensionModel.json b/Specs/NumberWithUnit/Portuguese/DimensionModel.json index f9e7d2f3e7..593dfdaa41 100644 --- a/Specs/NumberWithUnit/Portuguese/DimensionModel.json +++ b/Specs/NumberWithUnit/Portuguese/DimensionModel.json @@ -586,7 +586,7 @@ }, { "Input": "Em 1995 a Cannon introduziu a primeira lente SLR disponível comercialmente com estabilização de imagem interna, 75 - 300 mm f / 4 - 5. 6 es usm.", - "NotSupported": "javascript, python, java", + "NotSupported": "javascript, java", "Results": [ { "Text": "300 mm", diff --git a/Specs/NumberWithUnit/Spanish/DimensionModel.json b/Specs/NumberWithUnit/Spanish/DimensionModel.json index 88d0a7717f..4238220291 100644 --- a/Specs/NumberWithUnit/Spanish/DimensionModel.json +++ b/Specs/NumberWithUnit/Spanish/DimensionModel.json @@ -556,7 +556,7 @@ }, { "Input": "En 1995 Cannon introdujo la primera lente SLR comercialmente disponible con estabilización de imagen interna, 75 - 300 mm f / 4 - 5. 6 es usm.", - "NotSupported": "javascript, python, java", + "NotSupported": "javascript, java", "Results": [ { "Text": "300 mm",