diff --git a/snips_nlu/__init__.py b/snips_nlu/__init__.py
index 6ad21f661..536c4cf0d 100644
--- a/snips_nlu/__init__.py
+++ b/snips_nlu/__init__.py
@@ -3,6 +3,7 @@
 from duckling import core
 
+from snips_nlu.resources import load_resources
 from snips_nlu.utils import ROOT_PATH, PACKAGE_NAME
 
 core.load()
@@ -11,3 +12,5 @@
 with io.open(os.path.join(ROOT_PATH, PACKAGE_NAME, VERSION_FILE_NAME)) as f:
     __version__ = f.readline().strip()
+
+load_resources()
diff --git a/snips_nlu/__version__ b/snips_nlu/__version__
index 87a087111..448a0fa11 100644
--- a/snips_nlu/__version__
+++ b/snips_nlu/__version__
@@ -1 +1 @@
-0.3.3
\ No newline at end of file
+0.3.4
\ No newline at end of file
diff --git a/snips_nlu/constants.py b/snips_nlu/constants.py
index 5d472df37..ac6caf814 100644
--- a/snips_nlu/constants.py
+++ b/snips_nlu/constants.py
@@ -1,28 +1,33 @@
-INTENT_NAME = u"intent_name"
-PROBABILITY = u"probability"
-PARSED_INTENT = u"intent"
-PARSED_SLOTS = u"slots"
-TEXT = u"text"
-AUTOMATICALLY_EXTENSIBLE = u"automatically_extensible"
-USE_SYNONYMS = u"use_synonyms"
-SYNONYMS = u"synonyms"
-DATA = u"data"
-INTENTS = u"intents"
-ENTITIES = u"entities"
-ENTITY = u"entity"
-SLOT_NAME = u"slot_name"
-UTTERANCES = u"utterances"
-ENGINE_TYPE = u"engineType"
-CUSTOM_ENGINE = u"regex"
-BUILTIN_ENGINE = u"tensorflow"
-LANGUAGE = u"language"
-MATCH_RANGE = u"range"
-VALUE = u"value"
-CUSTOM_PARSERS = u"custom_parsers"
-BUILTIN_PARSER = u"builtin_parser"
-BUILTIN_PATH = u"builtin_path"
-BUILTIN_BINARY = u"builtin_binary"
-LABEL = u"label"
-DUCKLING_DIM = u"duckling_dim"
-NGRAM = u"ngram"
-TOKEN_INDEXES = u"token_indexes"
+from __future__ import unicode_literals
+
+INTENT_NAME = "intent_name"
+PROBABILITY = "probability"
+PARSED_INTENT = "intent"
+PARSED_SLOTS = "slots"
+TEXT = "text"
+AUTOMATICALLY_EXTENSIBLE = "automatically_extensible"
+USE_SYNONYMS = "use_synonyms"
+SYNONYMS = "synonyms"
+DATA = "data"
+INTENTS = "intents"
+ENTITIES = "entities"
+ENTITY = "entity"
+SLOT_NAME = "slot_name"
+UTTERANCES = "utterances"
+ENGINE_TYPE = "engineType"
+CUSTOM_ENGINE = "regex"
+BUILTIN_ENGINE = "tensorflow"
+LANGUAGE = "language"
+MATCH_RANGE = "range"
+VALUE = "value"
+BUILTIN_PARSER = "builtin_parser"
+BUILTIN_PATH = "builtin_path"
+BUILTIN_BINARY = "builtin_binary"
+LABEL = "label"
+DUCKLING_DIM = "duckling_dim"
+NGRAM = "ngram"
+TOKEN_INDEXES = "token_indexes"
+GAZETTEERS = "gazetteers"
+STOP_WORDS = "stop_words"
+SUBTITLES = "subtitles"
+WORD_CLUSTERS = "word_clusters"
diff --git a/snips_nlu/intent_classifier/data_augmentation.py b/snips_nlu/intent_classifier/data_augmentation.py
index f66274351..84edd7e4f 100644
--- a/snips_nlu/intent_classifier/data_augmentation.py
+++ b/snips_nlu/intent_classifier/data_augmentation.py
@@ -1,10 +1,11 @@
-import numpy as np
 from uuid import uuid4
+
+import numpy as np
+
 from snips_nlu.constants import INTENTS, UTTERANCES, DATA
 from snips_nlu.dataset import get_text_from_chunks
-from snips_nlu.intent_classifier.intent_classifier_resources import \
-    get_subtitles
 from snips_nlu.preprocessing import stem_sentence
+from snips_nlu.resources import get_subtitles
 
 NOISE_NAME = str(uuid4()).decode()
diff --git a/snips_nlu/intent_classifier/feature_extraction.py b/snips_nlu/intent_classifier/feature_extraction.py
index fbcfb1847..824ad3026 100644
--- a/snips_nlu/intent_classifier/feature_extraction.py
+++ b/snips_nlu/intent_classifier/feature_extraction.py
@@ -1,7 +1,7 @@
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.feature_selection import chi2
 
-from intent_classifier_resources import get_stop_words
+from snips_nlu.resources import get_stop_words
 from snips_nlu.languages import Language
 from snips_nlu.utils import ensure_string, safe_pickle_dumps, safe_pickle_loads
 from snips_nlu.tokenization import tokenize_light
diff --git a/snips_nlu/intent_classifier/intent_classifier_resources.py b/snips_nlu/intent_classifier/intent_classifier_resources.py
deleted file mode 100644
index 603dfedbb..000000000
--- a/snips_nlu/intent_classifier/intent_classifier_resources.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import io
-import os
-
-from snips_nlu.utils import get_resources_path
-
-STOP_WORDS = dict()
-SUBTITLES = dict()
-
-
-def get_stop_words(language):
-    global STOP_WORDS
-    if language.iso_code not in STOP_WORDS:
-        stop_words_file_path = os.path.join(
-            get_resources_path(language), 'stop_words.txt')
-        with io.open(stop_words_file_path, encoding='utf8') as f:
-            lines = [l.strip() for l in f]
-        STOP_WORDS[language.iso_code] = set(l for l in lines if len(l) > 0)
-
-    return STOP_WORDS[language.iso_code]
-
-
-def get_subtitles(language):
-    global SUBTITLES
-    if language.iso_code not in SUBTITLES:
-        subtitles_file_path = os.path.join(
-            get_resources_path(language), 'subtitles.txt')
-        with io.open(subtitles_file_path, encoding='utf8') as f:
-            lines = [l.strip() for l in f]
-        SUBTITLES[language.iso_code] = set(l for l in lines if len(l) > 0)
-
-    return SUBTITLES[language.iso_code]
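The lazily-filled STOP_WORDS/SUBTITLES caches deleted above are superseded by the new snips_nlu/resources.py added later in this diff, which indexes every language's resource files up front and loads them eagerly: snips_nlu/__init__.py now calls load_resources() at import time. A minimal usage sketch, assuming the bundled resource files are present for English:

    from snips_nlu.languages import Language
    from snips_nlu.resources import get_stop_words, get_subtitles

    # load_resources() already ran when snips_nlu was imported, so the
    # getters are plain lookups into module-level dicts.
    en_stop_words = get_stop_words(Language.EN)
    en_subtitles = get_subtitles(Language.EN)

One behavioral shift worth noting: the new getters raise a KeyError for a language whose resources were never indexed, where the old code failed with an IOError on first access.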
diff --git a/snips_nlu/intent_parser/crf_intent_parser.py b/snips_nlu/intent_parser/probabilistic_intent_parser.py
similarity index 78%
rename from snips_nlu/intent_parser/crf_intent_parser.py
rename to snips_nlu/intent_parser/probabilistic_intent_parser.py
index 5075afe0e..aacb30f15 100644
--- a/snips_nlu/intent_parser/crf_intent_parser.py
+++ b/snips_nlu/intent_parser/probabilistic_intent_parser.py
@@ -1,9 +1,7 @@
 from intent_parser import IntentParser
-from snips_nlu.constants import (DATA, INTENTS, SLOT_NAME, UTTERANCES, ENTITY,
-                                 CUSTOM_ENGINE)
+from snips_nlu.constants import (DATA, INTENTS, CUSTOM_ENGINE)
 from snips_nlu.dataset import filter_dataset
 from snips_nlu.languages import Language
-from snips_nlu.result import ParsedSlot
 from snips_nlu.slot_filler.crf_tagger import CRFTagger
 from snips_nlu.slot_filler.crf_utils import (tags_to_slots,
                                              utterance_to_sample)
@@ -41,21 +39,10 @@ def default_data_augmentation_config(language):
     return DataAugmentationConfig()
 
 
-def get_slot_name_to_entity_mapping(dataset):
-    slot_name_to_entity = dict()
-    for intent in dataset[INTENTS].values():
-        for utterance in intent[UTTERANCES]:
-            for chunk in utterance[DATA]:
-                if SLOT_NAME in chunk:
-                    slot_name_to_entity[chunk[SLOT_NAME]] = chunk[ENTITY]
-    return slot_name_to_entity
-
-
-class CRFIntentParser(IntentParser):
+class ProbabilisticIntentParser(IntentParser):
     def __init__(self, language, intent_classifier, crf_taggers,
-                 slot_name_to_entity_mapping=None,
-                 data_augmentation_config=None):
-        super(CRFIntentParser, self).__init__()
+                 slot_name_to_entity_mapping, data_augmentation_config=None):
+        super(ProbabilisticIntentParser, self).__init__()
         self.language = language
         self.intent_classifier = intent_classifier
         self._crf_taggers = None
@@ -78,7 +65,7 @@ def crf_taggers(self, value):
 
     def get_intent(self, text):
         if not self.fitted:
-            raise ValueError("CRFIntentParser must be fitted before "
+            raise ValueError("ProbabilisticIntentParser must be fitted before "
                              "`get_intent` is called")
         return self.intent_classifier.get_intent(text)
 
@@ -86,21 +73,20 @@ def get_slots(self, text, intent=None):
         if intent is None:
             raise ValueError("intent can't be None")
         if not self.fitted:
-            raise ValueError("CRFIntentParser must be fitted before "
+            raise ValueError("ProbabilisticIntentParser must be fitted before "
                              "`get_slots` is called")
         if intent not in self.crf_taggers:
             raise KeyError("Invalid intent '%s'" % intent)
+
+        tokens = tokenize(text)
+        if len(tokens) == 0:
+            return []
+        intent_slots_mapping = self.slot_name_to_entity_mapping[intent]
         tagger = self.crf_taggers[intent]
         tags = tagger.get_tags(tokens)
-        slots = tags_to_slots(tokens, tags,
-                              tagging_scheme=tagger.tagging_scheme)
-        return [ParsedSlot(match_range=s["range"],
-                           value=text[s["range"][0]:s["range"][1]],
-                           entity=self.slot_name_to_entity_mapping[
-                               s[SLOT_NAME]],
-                           slot_name=s[SLOT_NAME]) for s in slots]
+        slots = tags_to_slots(text, tokens, tags, tagger.tagging_scheme,
+                              intent_slots_mapping)
+        return slots
 
     @property
     def fitted(self):
@@ -109,8 +95,6 @@ def fitted(self):
 
     def fit(self, dataset):
         custom_dataset = filter_dataset(dataset, CUSTOM_ENGINE)
-        self.slot_name_to_entity_mapping = get_slot_name_to_entity_mapping(
-            custom_dataset)
         self.intent_classifier = self.intent_classifier.fit(dataset)
         for intent_name in custom_dataset[INTENTS]:
             augmented_intent_utterances = augment_utterances(
diff --git a/snips_nlu/intent_parser/regex_intent_parser.py b/snips_nlu/intent_parser/regex_intent_parser.py
index 2cfbb3a2e..14882cc4d 100644
--- a/snips_nlu/intent_parser/regex_intent_parser.py
+++ b/snips_nlu/intent_parser/regex_intent_parser.py
@@ -1,10 +1,9 @@
-import operator
 import re
 
+from snips_nlu.built_in_entities import BuiltInEntity
 from snips_nlu.constants import (TEXT, USE_SYNONYMS, SYNONYMS, DATA, INTENTS,
                                  ENTITIES, SLOT_NAME, UTTERANCES, VALUE,
                                  ENTITY, CUSTOM_ENGINE)
-from snips_nlu.built_in_entities import BuiltInEntity
 from snips_nlu.dataset import filter_dataset
 from snips_nlu.intent_parser.intent_parser import IntentParser
 from snips_nlu.result import (IntentClassificationResult,
@@ -86,8 +85,9 @@ def get_joined_entity_utterances(dataset):
                           for syn in entry[SYNONYMS]]
         else:
             utterances = [entry[VALUE] for entry in entity[DATA]]
+        utterances_patterns = [re.escape(e) for e in utterances]
         joined_entity_utterances[entity_name] = r"|".join(
-            sorted([re.escape(e) for e in utterances], key=len, reverse=True))
+            sorted(utterances_patterns, key=len, reverse=True))
     return joined_entity_utterances
 
 
@@ -146,24 +146,12 @@ def get_intent(self, text):
         if not self.fitted:
             raise AssertionError("RegexIntentParser must be fitted before "
                                  "calling `get_entities`")
-        entities_per_intent = dict()
-        for intent in self.regexes_per_intent.keys():
-            entities_per_intent[intent] = self.get_slots(text, intent)
-
-        intents_probas = dict()
-        total_nb_entities = sum(
-            len(entities) for entities in entities_per_intent.values())
-        # TODO: handle intents without slots
-        if total_nb_entities == 0:
-            return None
-        for intent_name, entities in entities_per_intent.iteritems():
-            intents_probas[intent_name] = float(len(entities)) / float(
-                total_nb_entities)
-
-        top_intent, top_proba = max(intents_probas.items(),
-                                    key=operator.itemgetter(1))
-        return IntentClassificationResult(intent_name=top_intent,
-                                          probability=top_proba)
+        for intent, regexes in self.regexes_per_intent.iteritems():
+            for regex in regexes:
+                if regex.match(text) is not None:
+                    return IntentClassificationResult(intent_name=intent,
+                                                      probability=1.0)
+        return None
 
     def get_slots(self, text, intent=None):
         if not self.fitted:
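RegexIntentParser.get_intent now short-circuits: the first intent whose regex matches the text wins with probability 1.0, replacing the removed proportional slot-count scoring (which, per its own TODO, returned None for intents without slots). The small refactoring in get_joined_entity_utterances keeps the longest-utterance-first ordering of the alternation; a self-contained sketch of why that ordering matters:

    import re

    # Escape each entity utterance and sort by decreasing length so the
    # alternation prefers "New York City" over its prefix "New York".
    utterances = ["New York", "New York City", "Paris"]
    utterances_patterns = [re.escape(u) for u in utterances]
    joined = r"|".join(sorted(utterances_patterns, key=len, reverse=True))
    assert re.match(joined, "New York City").group(0) == "New York City"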
diff --git a/snips_nlu/nlu_engine.py b/snips_nlu/nlu_engine.py
index f3e237245..fa7b8fa22 100644
--- a/snips_nlu/nlu_engine.py
+++ b/snips_nlu/nlu_engine.py
@@ -1,22 +1,30 @@
+from __future__ import unicode_literals
+
 from abc import ABCMeta, abstractmethod
+from copy import copy
+from itertools import groupby, permutations
 
 from dataset import validate_and_format_dataset, filter_dataset
 from snips_nlu.built_in_entities import BuiltInEntity, get_built_in_entities
 from snips_nlu.constants import (
-    USE_SYNONYMS, SYNONYMS, DATA, INTENTS, ENTITIES, UTTERANCES,
-    LANGUAGE, VALUE, AUTOMATICALLY_EXTENSIBLE, ENTITY, BUILTIN_PARSER,
-    CUSTOM_PARSERS, CUSTOM_ENGINE, MATCH_RANGE)
+    INTENTS, ENTITIES, UTTERANCES, LANGUAGE, VALUE, AUTOMATICALLY_EXTENSIBLE,
+    ENTITY, BUILTIN_PARSER, CUSTOM_ENGINE, MATCH_RANGE, DATA, SLOT_NAME,
+    USE_SYNONYMS, SYNONYMS)
 from snips_nlu.intent_classifier.snips_intent_classifier import \
     SnipsIntentClassifier
 from snips_nlu.intent_parser.builtin_intent_parser import BuiltinIntentParser
-from snips_nlu.intent_parser.crf_intent_parser import CRFIntentParser
+from snips_nlu.intent_parser.probabilistic_intent_parser import \
+    ProbabilisticIntentParser
 from snips_nlu.intent_parser.regex_intent_parser import RegexIntentParser
 from snips_nlu.languages import Language
-from snips_nlu.result import ParsedSlot
+from snips_nlu.result import ParsedSlot, empty_result, \
+    IntentClassificationResult
 from snips_nlu.result import Result
 from snips_nlu.slot_filler.crf_tagger import CRFTagger, default_crf_model
-from snips_nlu.slot_filler.crf_utils import TaggingScheme
+from snips_nlu.slot_filler.crf_utils import TaggingScheme, positive_tagging, \
+    tags_to_slots
 from snips_nlu.slot_filler.feature_functions import crf_features
+from snips_nlu.tokenization import tokenize
 from snips_nlu.utils import instance_from_dict
@@ -46,15 +54,31 @@ def parse(self, text):
         pass
 
 
-def _parse(text, parsers, entities):
+def _parse(text, entities, rule_based_parser=None, probabilistic_parser=None,
+           builtin_parser=None, intent=None):
+    parsers = []
+    if rule_based_parser is not None:
+        parsers.append(rule_based_parser)
+    if probabilistic_parser is not None:
+        parsers.append(probabilistic_parser)
+
+    # if the intent is given, it's a custom intent
+    if intent is None and builtin_parser is not None:
+        parsers.append(builtin_parser)
     if len(parsers) == 0:
-        return Result(text, parsed_intent=None, parsed_slots=None)
+        return empty_result(text)
+
     for parser in parsers:
-        res = parser.get_intent(text)
-        if res is None:
-            continue
-        slots = parser.get_slots(text, res.intent_name)
+        if intent is None:
+            res = parser.get_intent(text)
+            if res is None:
+                continue
+            intent_name = res.intent_name
+        else:
+            res = IntentClassificationResult(intent, 1.0)
+            intent_name = intent
         valid_slot = []
+        slots = parser.get_slots(text, intent_name)
         for s in slots:
             slot_value = s.value
             # Check if the entity is from a custom intent
@@ -68,7 +92,66 @@ def _parse(text, parsers, entities):
                               s.slot_name)
                 valid_slot.append(s)
         return Result(text, parsed_intent=res, parsed_slots=valid_slot)
-    return Result(text, parsed_intent=None, parsed_slots=None)
+    return empty_result(text)
+
+
+def augment_slots(text, tagger, intent_slots_mapping, builtin_entities,
+                  missing_slots):
+    tokens = tokenize(text)
+    # TODO: Find a way to avoid tagging multiple times
+    tags = tagger.get_tags(tokens)
+    augmented_tags = tags
+    grouped_entities = groupby(builtin_entities, key=lambda s: s[ENTITY])
+    for entity, matches in grouped_entities:
+        spans_ranges = [match[MATCH_RANGE] for match in matches]
+        tokens_indexes = spans_to_tokens_indexes(spans_ranges, tokens)
+        related_slots = set(s for s in missing_slots
+                            if intent_slots_mapping[s] == entity.label)
+        slots_permutations = permutations(related_slots)
+        best_updated_tags = augmented_tags
+        best_permutation_score = -1
+        for slots in slots_permutations:
+            updated_tags = copy(augmented_tags)
+            for slot_index, slot in enumerate(slots):
+                if slot_index >= len(tokens_indexes):
+                    break
+                indexes = tokens_indexes[slot_index]
+                sub_tags_sequence = positive_tagging(tagger.tagging_scheme,
+                                                     slot, len(indexes))
+                updated_tags[indexes[0]:indexes[-1] + 1] = sub_tags_sequence
+            score = tagger.get_sequence_probability(tokens, updated_tags)
+            if score > best_permutation_score:
+                best_updated_tags = updated_tags
+                best_permutation_score = score
+        augmented_tags = best_updated_tags
+    return tags_to_slots(text, tokens, augmented_tags, tagger.tagging_scheme,
+                         intent_slots_mapping)
+
+
+def spans_to_tokens_indexes(spans, tokens):
+    tokens_indexes = []
+    for span_start, span_end in spans:
+        indexes = []
+        for i, token in enumerate(tokens):
+            if span_end > token.start and span_start < token.end:
+                indexes.append(i)
+        tokens_indexes.append(indexes)
+    return tokens_indexes
+
+
+def get_slot_name_mapping(dataset):
+    """
+    Returns a dict which maps slot names to entities
+    """
+    slot_name_mapping = dict()
+    for intent_name, intent in dataset[INTENTS].iteritems():
+        _dict = dict()
+        slot_name_mapping[intent_name] = _dict
+        for utterance in intent[UTTERANCES]:
+            for chunk in utterance[DATA]:
+                if SLOT_NAME in chunk:
+                    _dict[chunk[SLOT_NAME]] = chunk[ENTITY]
+    return slot_name_mapping
 
 
 def get_intent_custom_entities(dataset, intent):
@@ -105,13 +188,25 @@ def snips_nlu_entities(dataset):
 
 
 class SnipsNLUEngine(NLUEngine):
-    def __init__(self, language, builtin_parser=None, custom_parsers=None,
-                 entities=None):
+    def __init__(self, language, rule_based_parser=None,
+                 probabilistic_parser=None, builtin_parser=None, entities=None,
+                 slot_name_mapping=None, ui_builtin_parsing_threshold=None,
+                 intents_data_sizes=None):
         super(SnipsNLUEngine, self).__init__(language)
+        self.rule_based_parser = rule_based_parser
+        self.probabilistic_parser = probabilistic_parser
         self._builtin_parser = None
         self.builtin_parser = builtin_parser
-        self.custom_parsers = custom_parsers
+        if entities is None:
+            entities = dict()
         self.entities = entities
+        if slot_name_mapping is None:
+            slot_name_mapping = dict()
+        self.slot_name_mapping = slot_name_mapping
+        if ui_builtin_parsing_threshold is None:
+            ui_builtin_parsing_threshold = 5
+        self.ui_builtin_parsing_threshold = ui_builtin_parsing_threshold
+        self.intents_data_sizes = intents_data_sizes
 
     @property
     def builtin_parser(self):
@@ -127,21 +222,89 @@ def builtin_parser(self, value):
                 % (value.parser.language, self.language.iso_code))
         self._builtin_parser = value
 
-    def parse(self, text):
+    def parse(self, text, intent=None, force_builtin_entities=False):
         """
         Parse the input text and returns a dictionary containing the most
         likely intent and slots.
+
+        If the intent is provided, intent classification is not performed.
+        If the builtin entity parsing is enforced, then the intent must be
+        provided.
         """
-        if self.builtin_parser is None and self.custom_parsers is None:
-            raise ValueError("NLUEngine as no built-in parser nor "
-                             "custom parsers")
-        parsers = []
-        if self.custom_parsers is not None:
-            parsers += self.custom_parsers
-        if self.builtin_parser is not None:
-            parsers.append(self.builtin_parser)
+        if force_builtin_entities:
+            if intent is None:
+                raise ValueError("If builtin entities parsing is enforced, "
+                                 "intent should be passed")
+            return self._parse_and_force_builtin_entities(
+                text, intent).as_dict()
+        else:
+            return self._parse(text, intent=intent).as_dict()
+
+    def _parse(self, text, intent=None):
+        result = _parse(text, self.entities, self.rule_based_parser,
+                        self.probabilistic_parser, self.builtin_parser,
+                        intent)
+        if result.is_empty():
+            return result
 
-        return _parse(text, parsers, self.entities).as_dict()
+        result = self.augment_slots_with_builtin_entities(result)
+        return result
+
+    def _parse_and_force_builtin_entities(self, text, intent):
+        """
+        Parse the input text for UI auto tagging and returns a dictionary
+        containing the most likely slots.
+        """
+        result = self._parse(text, intent=intent)
+        force_builtin_parsing = self.intents_data_sizes[intent] < \
+            self.ui_builtin_parsing_threshold
+        if not force_builtin_parsing:
+            return result
+
+        built_in_entities = get_built_in_entities(text, self.language)
+        if len(built_in_entities) == 0:
+            return result
+
+        slots = result.parsed_slots
+        if slots is None:
+            slots = [ParsedSlot(e[MATCH_RANGE], e[VALUE], e[ENTITY].label,
+                                e[ENTITY].label) for e in built_in_entities]
+        else:
+            for ent in built_in_entities:
+                if any(s.match_range[0] <= ent[MATCH_RANGE][1]
+                       and s.match_range[1] >= ent[MATCH_RANGE][0]
+                       for s in slots):
+                    continue
+                parsed_slot = ParsedSlot(ent[MATCH_RANGE], ent[VALUE],
+                                         ent[ENTITY].label, ent[ENTITY].label)
+                slots.append(parsed_slot)
+        parsed_intent = IntentClassificationResult(
+            result.parsed_intent.intent_name, result.parsed_intent.probability)
+        return Result(text, parsed_intent=parsed_intent, parsed_slots=slots)
+
+    def augment_slots_with_builtin_entities(self, result):
+        if self.probabilistic_parser is None:
+            return result
+
+        intent_name = result.parsed_intent.intent_name
+        intent_slots_mapping = self.slot_name_mapping.get(intent_name, dict())
+        all_intent_slots = intent_slots_mapping.keys()
+        builtin_slots = set(s for s in all_intent_slots
+                            if intent_slots_mapping[s] in
+                            BuiltInEntity.built_in_entity_by_label)
+        found_slots = set(s.slot_name for s in result.parsed_slots)
+        missing_builtin_slots = set(builtin_slots).difference(found_slots)
+        if len(missing_builtin_slots) == 0:
+            return result
+
+        tagger = self.probabilistic_parser.crf_taggers[intent_name]
+        text = result.text
+        scope = [BuiltInEntity.from_label(intent_slots_mapping[slot])
+                 for slot in missing_builtin_slots]
+        builtin_entities = get_built_in_entities(text, self.language, scope)
+        slots = augment_slots(text, tagger, intent_slots_mapping,
+                              builtin_entities, missing_builtin_slots)
+        return Result(text, parsed_intent=result.parsed_intent,
+                      parsed_slots=slots)
 
     def fit(self, dataset):
         """
@@ -154,20 +317,23 @@
         """
         dataset = validate_and_format_dataset(dataset)
         custom_dataset = filter_dataset(dataset, CUSTOM_ENGINE)
-        custom_parser = RegexIntentParser().fit(dataset)
+        self.rule_based_parser = RegexIntentParser().fit(dataset)
         self.entities = snips_nlu_entities(dataset)
+        self.intents_data_sizes = {intent_name: len(intent[UTTERANCES])
+                                   for intent_name, intent
+                                   in custom_dataset[INTENTS].iteritems()}
+        self.slot_name_mapping = get_slot_name_mapping(custom_dataset)
         taggers = dict()
-        for intent in custom_dataset[INTENTS].keys():
+        for intent in custom_dataset[INTENTS]:
             intent_custom_entities = get_intent_custom_entities(custom_dataset,
                                                                 intent)
-            features = crf_features(intent_custom_entities,
-                                    language=self.language)
+            features = crf_features(intent_custom_entities, self.language)
             taggers[intent] = CRFTagger(default_crf_model(), features,
                                         TaggingScheme.BIO, self.language)
         intent_classifier = SnipsIntentClassifier(self.language)
-        crf_parser = CRFIntentParser(self.language, intent_classifier, taggers)
-        crf_parser = crf_parser.fit(dataset)
-        self.custom_parsers = [custom_parser, crf_parser]
+        self.probabilistic_parser = ProbabilisticIntentParser(
+            self.language, intent_classifier, taggers, self.slot_name_mapping)
+        self.probabilistic_parser.fit(dataset)
         return self
 
     def to_dict(self):
@@ -180,11 +346,22 @@ def to_dict(self):
         if self.language is not None:
             language_code = self.language.iso_code
 
+        rule_based_parser_dict = None
+        probabilistic_parser_dict = None
+        if self.rule_based_parser is not None:
+            rule_based_parser_dict = self.rule_based_parser.to_dict()
+        if self.probabilistic_parser is not None:
+            probabilistic_parser_dict = self.probabilistic_parser.to_dict()
+
         return {
             LANGUAGE: language_code,
-            CUSTOM_PARSERS: [p.to_dict() for p in self.custom_parsers],
+            "rule_based_parser": rule_based_parser_dict,
+            "probabilistic_parser": probabilistic_parser_dict,
             BUILTIN_PARSER: None,
-            ENTITIES: self.entities
+            "slot_name_mapping": self.slot_name_mapping,
+            "ui_builtin_parsing_threshold": self.ui_builtin_parsing_threshold,
+            ENTITIES: self.entities,
+            "intents_data_sizes": self.intents_data_sizes
         }
 
     @classmethod
@@ -202,20 +379,37 @@ def load_from(cls, language, customs=None, builtin_path=None,
         if isinstance(language, (str, unicode)):
             language = Language.from_iso_code(language)
 
-        custom_parsers = None
+        rule_based_parser = None
+        probabilistic_parser = None
+        builtin_parser = None
         entities = None
+        ui_builtin_parsing_threshold = None
+        slot_name_mapping = None
+        intent_data_size = None
+
         if customs is not None:
-            custom_parsers = [instance_from_dict(d) for d in
-                              customs[CUSTOM_PARSERS]]
+            rule_based_parser = instance_from_dict(
+                customs["rule_based_parser"])
+            probabilistic_parser = instance_from_dict(
+                customs["probabilistic_parser"])
             entities = customs[ENTITIES]
-        builtin_parser = None
+            ui_builtin_parsing_threshold = customs[
+                "ui_builtin_parsing_threshold"]
+            slot_name_mapping = customs["slot_name_mapping"]
+            intent_data_size = customs["intents_data_sizes"]
+
         if builtin_path is not None or builtin_binary is not None:
             builtin_parser = BuiltinIntentParser(language=language,
                                                  data_path=builtin_path,
                                                  data_binary=builtin_binary)
-        return cls(language, builtin_parser=builtin_parser,
-                   custom_parsers=custom_parsers, entities=entities)
+        return cls(language, rule_based_parser=rule_based_parser,
+                   probabilistic_parser=probabilistic_parser,
+                   builtin_parser=builtin_parser,
+                   slot_name_mapping=slot_name_mapping,
+                   entities=entities,
+                   ui_builtin_parsing_threshold=ui_builtin_parsing_threshold,
+                   intents_data_sizes=intent_data_size)
 
     def __eq__(self, other):
         return isinstance(other, self.__class__) and \
@@ -223,18 +417,3 @@ def __eq__(self, other):
 
     def __ne__(self, other):
         return not self.__eq__(other)
-
-
-class BuiltInEntitiesNLUEngine(NLUEngine):
-    def __init__(self, language):
-        super(BuiltInEntitiesNLUEngine, self).__init__(language)
-
-    def parse(self, text):
-        built_in_entities = get_built_in_entities(text, self.language)
-        slots = None
-        if len(built_in_entities) > 0:
-            slots = [
-                ParsedSlot(match_range=e[MATCH_RANGE], value=e[VALUE],
-                           entity=e[ENTITY].label, slot_name=e[ENTITY].label)
-                for e in built_in_entities]
-        return Result(text, parsed_intent=None, parsed_slots=slots).as_dict()
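spans_to_tokens_indexes above maps builtin-entity character ranges onto token indexes, treating both sides as half-open intervals: a token overlaps a span when span_end > token.start and span_start < token.end. A runnable sketch of the same test, using a stand-in Token type (the real one comes from snips_nlu.tokenization and also carries start/end offsets):

    from collections import namedtuple

    Token = namedtuple("Token", ["value", "start", "end"])  # stand-in type

    def spans_to_tokens_indexes(spans, tokens):
        tokens_indexes = []
        for span_start, span_end in spans:
            # half-open interval intersection test
            indexes = [i for i, t in enumerate(tokens)
                       if span_end > t.start and span_start < t.end]
            tokens_indexes.append(indexes)
        return tokens_indexes

    tokens = [Token("meet", 0, 4), Token("at", 5, 7), Token("noon", 8, 12)]
    print(spans_to_tokens_indexes([(5, 12)], tokens))  # -> [[1, 2]]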
diff --git a/snips_nlu/resources.py b/snips_nlu/resources.py
new file mode 100644
index 000000000..be682a37f
--- /dev/null
+++ b/snips_nlu/resources.py
@@ -0,0 +1,134 @@
+from __future__ import unicode_literals
+
+import io
+import os
+
+from snips_nlu.constants import (STOP_WORDS, SUBTITLES,
+                                 WORD_CLUSTERS, GAZETTEERS)
+from snips_nlu.languages import Language
+from snips_nlu.tokenization import tokenize
+from snips_nlu.utils import get_resources_path
+
+RESOURCE_INDEX = {
+    Language.EN: {
+        GAZETTEERS: ["top_10000_nouns.txt", "cities_us.txt",
+                     "cities_world.txt", "countries.txt", "states_us.txt",
+                     "stop_words.txt", "street_identifier.txt",
+                     "top_10000_words.txt"],
+        STOP_WORDS: "stop_words.txt",
+        SUBTITLES: "subtitles.txt",
+        WORD_CLUSTERS: ["brown_clusters.txt"]
+    },
+    Language.FR: {
+        STOP_WORDS: "stop_words.txt",
+        SUBTITLES: "subtitles.txt",
+    },
+    Language.ES: {
+        STOP_WORDS: "stop_words.txt",
+        SUBTITLES: "subtitles.txt",
+    },
+    Language.KO: {
+        STOP_WORDS: "stop_words.txt",
+        SUBTITLES: "subtitles.txt",
+    },
+    Language.DE: {
+        STOP_WORDS: "stop_words.txt",
+        SUBTITLES: "subtitles.txt",
+    }
+}
+
+_STOP_WORDS = dict()
+_SUBTITLES = dict()
+_GAZETTEERS = dict()
+_WORD_CLUSTERS = dict()
+_GAZETTEERS_REGEXES = dict()
+
+
+def load_stop_words():
+    for language in Language:
+        if STOP_WORDS in RESOURCE_INDEX[language]:
+            stop_words_file_path = os.path.join(
+                get_resources_path(language),
+                RESOURCE_INDEX[language][STOP_WORDS])
+            with io.open(stop_words_file_path, encoding='utf8') as f:
+                lines = [l.strip() for l in f]
+            _STOP_WORDS[language] = set(l for l in lines if len(l) > 0)
+
+
+def get_stop_words(language):
+    return _STOP_WORDS[language]
+
+
+def load_subtitles():
+    for language in Language:
+        if SUBTITLES in RESOURCE_INDEX[language]:
+            subtitles_file_path = os.path.join(
+                get_resources_path(language),
+                RESOURCE_INDEX[language][SUBTITLES])
+            with io.open(subtitles_file_path, encoding='utf8') as f:
+                lines = [l.strip() for l in f]
+            _SUBTITLES[language] = set(l for l in lines if len(l) > 0)
+
+
+def get_subtitles(language):
+    return _SUBTITLES[language]
+
+
+def load_clusters():
+    for language in Language:
+        word_clusters_paths = {
+            os.path.splitext(name)[0]: os.path.join(
+                get_resources_path(language), name)
+            for name in RESOURCE_INDEX[language].get(WORD_CLUSTERS, [])
+        }
+        if WORD_CLUSTERS in RESOURCE_INDEX[language]:
+            _word_clusters = dict()
+            _WORD_CLUSTERS[language] = _word_clusters
+            for name, path in word_clusters_paths.iteritems():
+                with io.open(path, encoding="utf8") as f:
+                    _word_clusters[name] = dict()
+                    for l in f:
+                        split = l.rstrip().lower().split("\t")
+                        normalized = " ".join(
+                            [t.value for t in tokenize(split[0])])
+                        if len(split) == 2:
+                            _word_clusters[name][normalized] = split[1]
+
+
+def get_word_clusters(language):
+    return _WORD_CLUSTERS[language]
+
+
+def load_gazetteers():
+    for language in Language:
+        gazetteers_paths = {
+            os.path.splitext(name)[0]: os.path.join(
+                get_resources_path(language), name)
+            for name in RESOURCE_INDEX[language].get(GAZETTEERS, [])
+        }
+        _gazetteers = dict()
+        _GAZETTEERS[language] = _gazetteers
+        for name, path in gazetteers_paths.iteritems():
+            with io.open(path, encoding="utf8") as f:
+                _gazetteers[name] = set()
+                for l in f:
+                    normalized = l.strip().lower()
+                    if len(normalized) > 0:
+                        normalized = " ".join(
+                            [t.value for t in tokenize(normalized)])
+                        _gazetteers[name].add(normalized)
+
+
+def get_gazetteers(language):
+    return _GAZETTEERS[language]
+
+
+def get_gazetteer(language, gazetteer_name):
+    return get_gazetteers(language)[gazetteer_name]
+
+
+def load_resources():
+    load_clusters()
+    load_gazetteers()
+    load_stop_words()
+    load_subtitles()
diff --git a/snips_nlu/result.py b/snips_nlu/result.py
index f6f3ad6f9..57e911b69 100644
--- a/snips_nlu/result.py
+++ b/snips_nlu/result.py
@@ -45,7 +45,7 @@ def as_dict(self):
             parsed_intent = None
         if self.parsed_slots is not None:
             parsed_slots = map(lambda slot: slot.as_dict(),
-                                self.parsed_slots)
+                               self.parsed_slots)
         else:
             parsed_slots = None
         return {
@@ -53,3 +53,10 @@ def as_dict(self):
             PARSED_INTENT: parsed_intent,
             PARSED_SLOTS: parsed_slots
         }
+
+    def is_empty(self):
+        return self.parsed_intent is None and self.parsed_slots is None
+
+
+def empty_result(text):
+    return Result(text=text, parsed_intent=None, parsed_slots=None)
diff --git a/snips_nlu/slot_filler/crf_resources.py b/snips_nlu/slot_filler/crf_resources.py
deleted file mode 100644
index cffb1177b..000000000
--- a/snips_nlu/slot_filler/crf_resources.py
+++ /dev/null
@@ -1,91 +0,0 @@
-import io
-import os
-import re
-
-from snips_nlu.languages import Language
-from snips_nlu.tokenization import tokenize
-from snips_nlu.utils import get_resources_path
-
-CLUSTER_NAMES = {
-    Language.EN: ["brown_clusters"]
-}
-
-WORD_CLUSTERS = dict()
-
-
-def get_word_clusters(language):
-    global WORD_CLUSTERS
-    word_clusters_paths = {
-        name: os.path.join(get_resources_path(language), "%s.txt" % name)
-        for name in CLUSTER_NAMES.get(language, [])
-    }
-    if language not in WORD_CLUSTERS:
-        _word_clusters = dict()
-        WORD_CLUSTERS[language] = _word_clusters
-        for name, path in word_clusters_paths.iteritems():
-            with io.open(path, encoding="utf8") as f:
-                _word_clusters[name] = dict()
-                for l in f:
-                    split = l.rstrip().lower().split("\t")
-                    normalized = " ".join(
-                        [t.value for t in tokenize(split[0])])
-                    if len(split) == 2:
-                        _word_clusters[name][normalized] = split[1]
-
-    return WORD_CLUSTERS[language]
-
-
-GAZETTEERS_NAMES = {
-    Language.EN: ["top_10000_nouns", "cities_us", "cities_world",
-                  "countries", "states_us", "stop_words",
-                  "street_identifier", "top_10000_words"]
-}
-
-GAZETTEERS = dict()
-
-
-def get_gazetteers(language):
-    global GAZETTEERS
-    gazetteers_paths = {
-        name: os.path.join(get_resources_path(language), "%s.txt" % name)
-        for name in GAZETTEERS_NAMES.get(language, [])
-    }
-    if language not in GAZETTEERS:
-        _gazetteers = dict()
-        GAZETTEERS[language] = _gazetteers
-        for name, path in gazetteers_paths.iteritems():
-            with io.open(path, encoding="utf8") as f:
-                _gazetteers[name] = set()
-                for l in f:
-                    normalized = l.strip().lower()
-                    if len(normalized) > 0:
-                        normalized = " ".join(
-                            [t.value for t in tokenize(normalized)])
-                        _gazetteers[name].add(normalized)
-
-    return GAZETTEERS[language]
-
-
-def get_gazetteer(language, gazetteer_name):
-    return get_gazetteers(language)[gazetteer_name]
-
-
-GAZETTEERS_REGEXES = dict()
-
-
-def get_gazetteers_regexes(language):
-    global GAZETTEERS_REGEXES
-    if language not in GAZETTEERS_REGEXES:
-        gazetteers = get_gazetteers(language)
-        _gazetteers_regexes = dict()
-        GAZETTEERS_REGEXES[language] = _gazetteers_regexes
-        for name, expression_set in gazetteers.iteritems():
-            pattern = r"|".join(re.escape(e) for e in
-                                sorted(expression_set, key=len, reverse=True))
-            regex = re.compile(pattern, re.IGNORECASE)
-            _gazetteers_regexes[name] = regex
-    return GAZETTEERS_REGEXES[language]
-
-
-def get_gazetteer_regex(language, gazetteer_name):
-    return get_gazetteers_regexes(language)[gazetteer_name]
diff --git a/snips_nlu/slot_filler/crf_tagger.py b/snips_nlu/slot_filler/crf_tagger.py
index cbf9b2356..c4a96d1da 100644
--- a/snips_nlu/slot_filler/crf_tagger.py
+++ b/snips_nlu/slot_filler/crf_tagger.py
@@ -69,6 +69,11 @@ def get_tags(self, tokens):
         features = self.compute_features(tokens)
         return self.crf_model.predict_single(features)
 
+    def get_sequence_probability(self, tokens, labels):
+        features = self.compute_features(tokens)
+        self.crf_model.tagger_.set(features)
+        return self.crf_model.tagger_.probability(labels)
+
     def fit(self, data, verbose=False):
         X = [self.compute_features(sample[TOKENS]) for sample in data]
         Y = [sample[TAGS] for sample in data]
diff --git a/snips_nlu/slot_filler/crf_utils.py b/snips_nlu/slot_filler/crf_utils.py
index 8ef565aef..aadf3d3b9 100644
--- a/snips_nlu/slot_filler/crf_utils.py
+++ b/snips_nlu/slot_filler/crf_utils.py
@@ -1,6 +1,7 @@
 from enum import Enum, unique
 
 from snips_nlu.constants import TEXT, SLOT_NAME
+from snips_nlu.result import ParsedSlot
 from snips_nlu.tokenization import tokenize, Token
 
 BEGINNING_PREFIX = u'B-'
@@ -114,16 +115,24 @@ def _tags_to_slots(tags, tokens, is_start_of_slot, is_end_of_slot):
     return slots
 
 
-def tags_to_slots(tokens, tags, tagging_scheme):
+def tags_to_slots(text, tokens, tags, tagging_scheme, intent_slots_mapping):
     if tagging_scheme == TaggingScheme.IO:
-        return _tags_to_slots(tags, tokens, start_of_io_slot, end_of_io_slot)
+        slots = _tags_to_slots(tags, tokens, start_of_io_slot, end_of_io_slot)
     elif tagging_scheme == TaggingScheme.BIO:
-        return _tags_to_slots(tags, tokens, start_of_bio_slot, end_of_bio_slot)
+        slots = _tags_to_slots(tags, tokens, start_of_bio_slot,
+                               end_of_bio_slot)
     elif tagging_scheme == TaggingScheme.BILOU:
-        return _tags_to_slots(tags, tokens, start_of_bilou_slot,
-                              end_of_bilou_slot)
+        slots = _tags_to_slots(tags, tokens, start_of_bilou_slot,
+                               end_of_bilou_slot)
     else:
         raise ValueError("Unknown tagging scheme %s" % tagging_scheme)
+    return [
+        ParsedSlot(match_range=slot["range"],
+                   value=text[slot["range"][0]:slot["range"][1]],
+                   entity=intent_slots_mapping[slot[SLOT_NAME]],
+                   slot_name=slot[SLOT_NAME])
+        for slot in slots
+    ]
 
 
 def positive_tagging(tagging_scheme, slot_name, slot_size):
@@ -188,4 +197,4 @@ def get_scheme_prefix(index, indexes, tagging_scheme):
         else:
             return INSIDE_PREFIX
     else:
-        raise ValueError("Invalid tagging scheme %s" % tagging_scheme)
\ No newline at end of file
+        raise ValueError("Invalid tagging scheme %s" % tagging_scheme)
diff --git a/snips_nlu/slot_filler/data_augmentation.py b/snips_nlu/slot_filler/data_augmentation.py
index 35c15c71d..10c58d782 100644
--- a/snips_nlu/slot_filler/data_augmentation.py
+++ b/snips_nlu/slot_filler/data_augmentation.py
@@ -6,8 +6,7 @@
 from snips_nlu.constants import (UTTERANCES, DATA, ENTITY, USE_SYNONYMS,
                                  SYNONYMS, VALUE, TEXT, INTENTS, ENTITIES)
-from snips_nlu.intent_classifier.intent_classifier_resources import \
-    get_subtitles
+from snips_nlu.resources import get_subtitles
 from snips_nlu.tokenization import tokenize
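The new CRFTagger.get_sequence_probability above is the scoring hook that augment_slots relies on: it computes the features once, binds them to the underlying python-crfsuite Tagger with set(), and asks for the probability of a candidate label sequence. A hypothetical usage sketch, assuming a fitted tagger; the party_size/time slot names are illustrative and not taken from this diff:

    from snips_nlu.tokenization import tokenize

    # Rank two hand-built taggings of the same tokens, keep the likelier one.
    tokens = tokenize("book a table for two at noon")  # 7 word tokens here
    candidate_a = ["O", "O", "O", "O", "B-party_size", "O", "B-time"]
    candidate_b = ["O", "O", "O", "O", "O", "O", "B-time"]
    scores = [(tagger.get_sequence_probability(tokens, tags), tags)
              for tags in (candidate_a, candidate_b)]
    best_score, best_tags = max(scores)

This is the same compare-and-keep-the-best pattern augment_slots uses when trying permutations of missing builtin slots.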
diff --git a/snips_nlu/slot_filler/feature_functions.py b/snips_nlu/slot_filler/feature_functions.py
index e7915a873..09c38c915 100644
--- a/snips_nlu/slot_filler/feature_functions.py
+++ b/snips_nlu/slot_filler/feature_functions.py
@@ -1,6 +1,6 @@
 from collections import namedtuple
 
-from crf_resources import get_word_clusters, get_gazetteer
+from snips_nlu.resources import get_word_clusters, get_gazetteer
 from snips_nlu.built_in_entities import get_built_in_entities, BuiltInEntity
 from snips_nlu.constants import (MATCH_RANGE, TOKEN_INDEXES, NGRAM)
 from snips_nlu.languages import Language
diff --git a/snips_nlu/tests/test_built_in_entities.py b/snips_nlu/tests/test_built_in_entities.py
index dab009ddc..6e4100ace 100644
--- a/snips_nlu/tests/test_built_in_entities.py
+++ b/snips_nlu/tests/test_built_in_entities.py
@@ -16,23 +16,22 @@ def test_get_built_in_entities(self, mocked_duckling_parse):
         language = Language.from_iso_code(language)
         text = "let's meet at 2p.m in the bronx"
 
-        def mocked_parse(module, text):
-            return [{
-                'body': u'at 2p.m.',
-                'dim': 'time',
-                'end': 17,
-                'value': {
-                    'values': [{'grain': 'hour', 'type': 'value',
-                                'value': '2017-03-29 14:00:00'},
-                               {'grain': 'hour', 'type': 'value',
-                                'value': '2017-03-30 14:00:00'},
-                               {'grain': 'hour', 'type': 'value',
-                                'value': '2017-03-31 14:00:00'}],
-                    'type': 'value',
-                    'grain': 'hour', 'value': '2017-03-27 14:00:00'},
-                'start': 9}]
-
-        mocked_duckling_parse.side_effect = mocked_parse
+        mocked_parse = [{
+            'body': u'at 2p.m.',
+            'dim': 'time',
+            'end': 17,
+            'value': {
+                'values': [{'grain': 'hour', 'type': 'value',
+                            'value': '2017-03-29 14:00:00'},
+                           {'grain': 'hour', 'type': 'value',
+                            'value': '2017-03-30 14:00:00'},
+                           {'grain': 'hour', 'type': 'value',
+                            'value': '2017-03-31 14:00:00'}],
+                'type': 'value',
+                'grain': 'hour', 'value': '2017-03-27 14:00:00'},
+            'start': 9}]
+
+        mocked_duckling_parse.return_value = mocked_parse
 
         expected_entities = [{MATCH_RANGE: (9, 17), VALUE: u'at 2p.m.',
                               ENTITY: BuiltInEntity.DATETIME}]
@@ -53,6 +52,7 @@ def test_scope_to_dims(self):
 
     def test_built_in_label_uniqueness(self):
         # Given
+        # noinspection PyTypeChecker
         labels = [ent.value["label"] for ent in BuiltInEntity]
 
         # When
@@ -63,6 +63,7 @@ def test_built_in_label_uniqueness(self):
 
     def test_built_in_label_duckling_dim_mapping(self):
         # Given
+        # noinspection PyTypeChecker
         duckling_names = [ent.value["duckling_dim"] for ent in BuiltInEntity]
 
         # When
@@ -78,11 +79,7 @@ def test_duckling_cache(self, mocked_duckling_parse):
         language = "en"
         language = Language.from_iso_code(language)
         text = "input text used twice"
-
-        def mocked_parse(module, text):
-            return []
-
-        mocked_duckling_parse.side_effect = mocked_parse
+        mocked_duckling_parse.return_value = []
 
         # When
         get_built_in_entities(text, language)
diff --git a/snips_nlu/tests/test_crf_tagger.py b/snips_nlu/tests/test_crf_tagger.py
index b22491951..4b1aa810b 100644
--- a/snips_nlu/tests/test_crf_tagger.py
+++ b/snips_nlu/tests/test_crf_tagger.py
@@ -35,6 +35,7 @@ def test_should_be_serializable(self):
         tagger_dict = tagger.to_dict()
 
         # Then
+        # noinspection PyBroadException
         try:
             json.dumps(tagger_dict).decode("utf8")
         except:
@@ -86,6 +87,7 @@ def test_should_be_deserializable(self):
                            language=Language.EN)
         tagger_dict = tagger.to_dict()
         tagger_json = json.dumps(tagger_dict).decode("utf8")
+        # noinspection PyBroadException
         try:
             _ = CRFTagger.from_dict(json.loads(tagger_json))
         except:
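For reference while reading the rewritten expectations in the next file: positive_tagging(scheme, slot_name, slot_size), used by both augment_slots and these tests, expands a slot into one tag per token. Going by the B-/I-/L-/U- prefix constants imported in the test module, a three-token "animal" slot should expand as follows (expected values, not output captured from a run):

    from snips_nlu.slot_filler.crf_utils import TaggingScheme, positive_tagging

    print(positive_tagging(TaggingScheme.IO, "animal", 3))
    # expected: ['I-animal', 'I-animal', 'I-animal']
    print(positive_tagging(TaggingScheme.BIO, "animal", 3))
    # expected: ['B-animal', 'I-animal', 'I-animal']
    print(positive_tagging(TaggingScheme.BILOU, "animal", 3))
    # expected: ['B-animal', 'I-animal', 'L-animal']
    # a single-token slot under BILOU would use the unit prefix: ['U-animal']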
diff --git a/snips_nlu/tests/test_crf_utils.py b/snips_nlu/tests/test_crf_utils.py
index e6d05ba3a..7cd64d527 100644
--- a/snips_nlu/tests/test_crf_utils.py
+++ b/snips_nlu/tests/test_crf_utils.py
@@ -2,12 +2,12 @@
 
 from mock import patch
 
-from snips_nlu.constants import SLOT_NAME
+from snips_nlu.result import ParsedSlot
 from snips_nlu.slot_filler.crf_utils import (
     OUTSIDE, BEGINNING_PREFIX, LAST_PREFIX, UNIT_PREFIX, INSIDE_PREFIX,
     utterance_to_sample, TaggingScheme, negative_tagging,
-    positive_tagging, end_of_bio_slot, start_of_bio_slot, RANGE,
-    start_of_bilou_slot, end_of_bilou_slot, tags_to_slots)
+    positive_tagging, end_of_bio_slot, start_of_bio_slot, start_of_bilou_slot,
+    end_of_bilou_slot, tags_to_slots)
 from snips_nlu.tokenization import tokenize, Token
 
 
@@ -15,6 +15,7 @@ class TestCRFUtils(unittest.TestCase):
     def test_io_tags_to_slots(self):
         # Given
         slot_name = "animal"
+        intent_slots_mapping = {"animal": "animal"}
         tags = [
             {
                 "text": "",
@@ -32,10 +33,12 @@ def test_io_tags_to_slots(self):
                          INSIDE_PREFIX + slot_name,
                          INSIDE_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        "range": (7, 16),
-                        "slot_name": slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(7, 16),
+                        value="blue bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
@@ -43,20 +46,24 @@ def test_io_tags_to_slots(self):
                 "tags": [OUTSIDE, OUTSIDE, OUTSIDE,
                          INSIDE_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        "range": (7, 11),
-                        "slot_name": slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(7, 11),
+                        value="bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
                 "text": "bird",
                 "tags": [INSIDE_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        "range": (0, 4),
-                        "slot_name": slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(0, 4),
+                        value="bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
@@ -64,10 +71,12 @@ def test_io_tags_to_slots(self):
                 "tags": [INSIDE_PREFIX + slot_name,
                          INSIDE_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        "range": (0, 9),
-                        "slot_name": slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(0, 9),
+                        value="blue bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
@@ -78,10 +87,12 @@ def test_io_tags_to_slots(self):
                          INSIDE_PREFIX + slot_name,
                          INSIDE_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        "range": (0, 25),
-                        "slot_name": slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(0, 25),
+                        value="light blue bird blue bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
@@ -89,10 +100,12 @@ def test_io_tags_to_slots(self):
                 "tags": [INSIDE_PREFIX + slot_name,
                          INSIDE_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        "range": (0, 10),
-                        "slot_name": slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(0, 10),
+                        value="bird birdy",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             }
 
@@ -100,14 +113,16 @@ def test_io_tags_to_slots(self):
 
         for data in tags:
             # When
-            slots = tags_to_slots(tokenize(data["text"]), data["tags"],
-                                  TaggingScheme.IO)
+            slots = tags_to_slots(data["text"], tokenize(data["text"]),
+                                  data["tags"], TaggingScheme.IO,
+                                  intent_slots_mapping)
             # Then
             self.assertEqual(slots, data["expected_slots"])
 
     def test_bio_tags_to_slots(self):
         # Given
         slot_name = "animal"
+        intent_slots_mapping = {"animal": "animal"}
         tags = [
             {
                 "text": "",
@@ -125,10 +140,12 @@ def test_bio_tags_to_slots(self):
                          BEGINNING_PREFIX + slot_name,
                          INSIDE_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        RANGE: (7, 16),
-                        SLOT_NAME: slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(7, 16),
+                        value="blue bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
@@ -136,20 +153,24 @@ def test_bio_tags_to_slots(self):
                 "tags": [OUTSIDE, OUTSIDE, OUTSIDE,
                          BEGINNING_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        RANGE: (7, 11),
-                        SLOT_NAME: slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(7, 11),
+                        value="bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
                 "text": "bird",
                 "tags": [BEGINNING_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        RANGE: (0, 4),
-                        SLOT_NAME: slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(0, 4),
+                        value="bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
@@ -157,10 +178,12 @@ def test_bio_tags_to_slots(self):
                 "tags": [BEGINNING_PREFIX + slot_name,
                          INSIDE_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        RANGE: (0, 9),
-                        SLOT_NAME: slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(0, 9),
+                        value="blue bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
@@ -171,14 +194,18 @@ def test_bio_tags_to_slots(self):
                          BEGINNING_PREFIX + slot_name,
                          INSIDE_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        RANGE: (0, 15),
-                        SLOT_NAME: slot_name
-                    },
-                    {
-                        RANGE: (16, 25),
-                        SLOT_NAME: slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(0, 15),
+                        value="light blue bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    ),
+                    ParsedSlot(
+                        match_range=(16, 25),
+                        value="blue bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
@@ -186,14 +213,18 @@ def test_bio_tags_to_slots(self):
                 "tags": [BEGINNING_PREFIX + slot_name,
                          BEGINNING_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        RANGE: (0, 4),
-                        SLOT_NAME: slot_name
-                    },
-                    {
-                        RANGE: (5, 10),
-                        SLOT_NAME: slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(0, 4),
+                        value="bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    ),
+                    ParsedSlot(
+                        match_range=(5, 10),
+                        value="birdy",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
@@ -204,28 +235,34 @@ def test_bio_tags_to_slots(self):
                          INSIDE_PREFIX + slot_name,
                          INSIDE_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        RANGE: (0, 9),
-                        SLOT_NAME: slot_name
-                    },
-                    {
-                        RANGE: (14, 24),
-                        SLOT_NAME: slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(0, 9),
+                        value="blue bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    ),
+                    ParsedSlot(
+                        match_range=(14, 24),
+                        value="white bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             }
         ]
 
         for data in tags:
             # When
-            slots = tags_to_slots(tokenize(data["text"]), data["tags"],
-                                  TaggingScheme.BIO)
+            slots = tags_to_slots(data["text"], tokenize(data["text"]),
+                                  data["tags"], TaggingScheme.BIO,
+                                  intent_slots_mapping)
             # Then
             self.assertEqual(slots, data["expected_slots"])
 
     def test_bilou_tags_to_slots(self):
         # Given
         slot_name = "animal"
+        intent_slots_mapping = {"animal": "animal"}
         tags = [
             {
                 "text": "",
@@ -243,10 +280,12 @@ def test_bilou_tags_to_slots(self):
                          BEGINNING_PREFIX + slot_name,
                          LAST_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        RANGE: (7, 16),
-                        SLOT_NAME: slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(7, 16),
+                        value="blue bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
@@ -254,20 +293,24 @@ def test_bilou_tags_to_slots(self):
                 "tags": [OUTSIDE, OUTSIDE, OUTSIDE,
                          UNIT_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        RANGE: (7, 11),
-                        SLOT_NAME: slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(7, 11),
+                        value="bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
                 "text": "bird",
                 "tags": [UNIT_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        RANGE: (0, 4),
-                        SLOT_NAME: slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(0, 4),
+                        value="bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
@@ -275,10 +318,12 @@ def test_bilou_tags_to_slots(self):
                 "tags": [BEGINNING_PREFIX + slot_name,
                          LAST_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        RANGE: (0, 9),
-                        SLOT_NAME: slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(0, 9),
+                        value="blue bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
@@ -289,14 +334,18 @@ def test_bilou_tags_to_slots(self):
                          BEGINNING_PREFIX + slot_name,
                          LAST_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        RANGE: (0, 15),
-                        SLOT_NAME: slot_name
-                    },
-                    {
-                        RANGE: (16, 25),
-                        SLOT_NAME: slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(0, 15),
+                        value="light blue bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    ),
+                    ParsedSlot(
+                        match_range=(16, 25),
+                        value="blue bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
@@ -304,14 +353,18 @@ def test_bilou_tags_to_slots(self):
                 "tags": [UNIT_PREFIX + slot_name,
                          UNIT_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        RANGE: (0, 4),
-                        SLOT_NAME: slot_name
-                    },
-                    {
-                        RANGE: (5, 10),
-                        SLOT_NAME: slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(0, 4),
+                        value="bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    ),
+                    ParsedSlot(
+                        match_range=(5, 10),
+                        value="birdy",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
@@ -322,18 +375,24 @@ def test_bilou_tags_to_slots(self):
                          BEGINNING_PREFIX + slot_name,
                          INSIDE_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        RANGE: (0, 10),
-                        SLOT_NAME: slot_name
-                    },
-                    {
-                        RANGE: (11, 15),
-                        SLOT_NAME: slot_name
-                    },
-                    {
-                        RANGE: (16, 25),
-                        SLOT_NAME: slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(0, 10),
+                        value="light bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    ),
+                    ParsedSlot(
+                        match_range=(11, 15),
+                        value="bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    ),
+                    ParsedSlot(
+                        match_range=(16, 25),
+                        value="blue bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
             {
@@ -342,26 +401,33 @@ def test_bilou_tags_to_slots(self):
                          BEGINNING_PREFIX + slot_name,
                          UNIT_PREFIX + slot_name],
                 "expected_slots": [
-                    {
-                        RANGE: (0, 4),
-                        SLOT_NAME: slot_name
-                    },
-                    {
-                        RANGE: (5, 9),
-                        SLOT_NAME: slot_name
-                    },
-                    {
-                        "range": (10, 14),
-                        SLOT_NAME: slot_name
-                    }
+                    ParsedSlot(
+                        match_range=(0, 4),
+                        value="bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    ),
+                    ParsedSlot(
+                        match_range=(5, 9),
+                        value="bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    ),
+                    ParsedSlot(
+                        match_range=(10, 14),
+                        value="bird",
+                        entity=slot_name,
+                        slot_name=slot_name
+                    )
                 ]
             },
         ]
 
         for data in tags:
             # When
-            slots = tags_to_slots(tokenize(data["text"]), data["tags"],
-                                  TaggingScheme.BILOU)
+            slots = tags_to_slots(data["text"], tokenize(data["text"]),
+                                  data["tags"], TaggingScheme.BILOU,
+                                  intent_slots_mapping)
             # Then
             self.assertEqual(slots, data["expected_slots"])
 
@@ -372,18 +438,21 @@ def test_positive_tagging_should_handle_zero_length(self):
 
         # When
         tags = []
+        # noinspection PyTypeChecker
         for scheme in TaggingScheme:
             tags.append(positive_tagging(scheme, slot_name, slot_size))
 
         # Then
+        # noinspection PyTypeChecker
         expected_tags = [[]] * len(TaggingScheme)
         self.assertEqual(tags, expected_tags)
 
     @patch('snips_nlu.slot_filler.crf_utils.positive_tagging')
     def test_utterance_to_sample(self, mocked_positive_tagging):
         # Given
-        def mock_positive_tagging(tagging_scheme, slot_name, slot_size):
-            return [INSIDE_PREFIX + slot_name for _ in xrange(slot_size)]
+        # noinspection PyUnusedLocal
+        def mock_positive_tagging(tagging_scheme, slot, slot_size):
+            return [INSIDE_PREFIX + slot for _ in xrange(slot_size)]
 
         mocked_positive_tagging.side_effect = mock_positive_tagging
         slot_name = "animal"
@@ -411,8 +480,9 @@ def test_utterance_to_sample_with_partial_slots(self,
                                                     mocked_positive_tagging):
         # Given
-        def mock_positive_tagging(tagging_scheme, slot_name, slot_size):
-            return [INSIDE_PREFIX + slot_name for _ in xrange(slot_size)]
+        # noinspection PyUnusedLocal
+        def mock_positive_tagging(tagging_scheme, slot, slot_size):
+            return [INSIDE_PREFIX + slot for _ in xrange(slot_size)]
 
         mocked_positive_tagging.side_effect = mock_positive_tagging
         slot_name = "animal"
diff --git a/snips_nlu/tests/test_feature_functions.py b/snips_nlu/tests/test_feature_functions.py
index b4bdfe675..62ffa8dda 100644
--- a/snips_nlu/tests/test_feature_functions.py
+++ b/snips_nlu/tests/test_feature_functions.py
@@ -39,10 +39,9 @@ def test_ngrams(self):
     @patch('snips_nlu.slot_filler.feature_functions.get_gazetteer')
     def test_ngrams_with_rare_word(self, mocked_get_gazetteer):
         # Given
-        def mocked_gazetteer(language, gazetteer_name):
-            return {"i", "love", "music"}
+        mocked_gazetteer = {"i", "love", "music"}
 
-        mocked_get_gazetteer.side_effect = mocked_gazetteer
+        mocked_get_gazetteer.return_value = mocked_gazetteer
         tokens = tokenize("I love house music")
         ngrams = {
             1: ["i", "love", "rare_word", "music"],
@@ -128,18 +127,15 @@ def test_get_built_in_annotation_fn(self, mocked_get_built_in_entities):
         # Given
         input_text = u"i ll be there tomorrow at noon is that ok"
 
-        def mocked_built_in_entities(text, language, scope):
-            if text == input_text:
-                return [
-                    {
-                        MATCH_RANGE: (14, 30),
-                        VALUE: u"tomorrow at noon",
-                        ENTITY: BuiltInEntity.DATETIME
-                    }
-                ]
-            return []
+        mocked_built_in_entities = [
+            {
+                MATCH_RANGE: (14, 30),
+                VALUE: u"tomorrow at noon",
+                ENTITY: BuiltInEntity.DATETIME
+            }
+        ]
 
-        mocked_get_built_in_entities.side_effect = mocked_built_in_entities
+        mocked_get_built_in_entities.return_value = mocked_built_in_entities
         tokens = tokenize(input_text)
         feature_fn = get_built_in_annotation_fn(BuiltInEntity.DATETIME.label,
                                                 Language.EN.iso_code,
@@ -233,6 +229,7 @@ def test_crf_features(self):
         }
 
         # When
+        # noinspection PyUnresolvedReferences
         np.random.seed(42)
         keep_prob = 0.5
         features_signatures = crf_features(
@@ -240,6 +237,7 @@ def test_crf_features(self):
             language=Language.EN)
 
         # Then
+        # noinspection PyUnresolvedReferences
        np.random.seed(42)
         collection_1 = ['dummy_a', 'dummy_a_bis', 'dummy_b', 'dummy_b_bis']
         collection_1_size = max(int(keep_prob * len(collection_1)), 1)
diff --git a/snips_nlu/tests/test_intent_classifier_data_augmentation.py b/snips_nlu/tests/test_intent_classifier_data_augmentation.py
index df5f72e9b..1af9ada55 100644
--- a/snips_nlu/tests/test_intent_classifier_data_augmentation.py
+++ b/snips_nlu/tests/test_intent_classifier_data_augmentation.py
@@ -56,20 +56,21 @@ def get_mocked_subtitles(_):
         avg_utterances = np.mean(nb_utterances)
 
         # When
+        # noinspection PyUnresolvedReferences
         np.random.seed(42)
         noise_factor = 2
-        utterances, y, intent_mapping = build_training_data(custom_dataset,
-                                                            builtin_dataset,
-                                                            Language.EN,
-                                                            use_stemming=False,
-                                                            noise_factor=noise_factor)
+        utterances, y, intent_mapping = build_training_data(
+            custom_dataset, builtin_dataset, Language.EN, use_stemming=False,
+            noise_factor=noise_factor)
 
         # Then
         expected_utterances = [get_text_from_chunks(utterance[DATA])
                                for intent in dataset[INTENTS].values()
                                for utterance in intent[UTTERANCES]]
+        # noinspection PyUnresolvedReferences
         np.random.seed(42)
         noise = list(get_mocked_subtitles(Language.EN))
+        # noinspection PyTypeChecker
         noise_size = int(min(noise_factor * avg_utterances, len(noise)))
         noisy_utterances = np.random.choice(noise, size=noise_size,
                                             replace=False)
diff --git a/snips_nlu/tests/test_intent_classifier_feature_extraction.py b/snips_nlu/tests/test_intent_classifier_feature_extraction.py
index 16154c60b..64cc90f4b 100644
--- a/snips_nlu/tests/test_intent_classifier_feature_extraction.py
+++ b/snips_nlu/tests/test_intent_classifier_feature_extraction.py
@@ -22,11 +22,13 @@ def test_should_be_serializable(self):
         serialized_featurizer = featurizer.to_dict()
 
         # Then
+        # noinspection PyBroadException
         try:
             dumped = json.dumps(serialized_featurizer).decode("utf8")
         except:
             self.fail("Featurizer dict should be json serializable to utf8")
 
+        # noinspection PyBroadException
         try:
             _ = Featurizer.from_dict(json.loads(dumped))
         except:
diff --git a/snips_nlu/tests/test_languages.py b/snips_nlu/tests/test_languages.py
index d17278176..1d48f2072 100644
--- a/snips_nlu/tests/test_languages.py
+++ b/snips_nlu/tests/test_languages.py
@@ -9,9 +9,11 @@ def test_iso_unique(self):
         language = Language
 
         # When
+        # noinspection PyTypeChecker
         nb_lang = len(set([lang.value['iso'] for lang in language]))
 
         # Then
+        # noinspection PyTypeChecker
         expected_nb_lang = len(Language)
         self.assertEqual(nb_lang, expected_nb_lang)
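A recurring pattern in the test changes throughout this diff: stub functions wired via side_effect are replaced with plain return_value assignments, which is the better fit whenever the stub ignores its arguments. Both attributes are standard mock API; a minimal sketch of the difference:

    from mock import Mock

    stub = Mock()
    stub.return_value = {"the", "a", "an"}       # constant-backed stub
    stub.side_effect = lambda language: {"the"}  # function-backed stub
    assert stub("en") == {"the"}  # side_effect wins while it is set
    stub.side_effect = None
    assert stub("en") == {"the", "a", "an"}  # falls back to return_value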
utf8") + # noinspection PyBroadException try: _ = SnipsNLUEngine.load_from(language=language.iso_code, customs=json.loads(dumped)) @@ -171,17 +158,19 @@ def test_should_be_serializable(self): self.assertEqual(deserialized_engine.parse(text), expected_parse) - @patch("snips_nlu.slot_filler.feature_functions.default_features", - side_effect=mocked_default) - @patch("snips_nlu.intent_parser.crf_intent_parser.CRFIntentParser" - ".get_slots") - @patch("snips_nlu.intent_parser.crf_intent_parser.CRFIntentParser" - ".get_intent") + @patch("snips_nlu.slot_filler.feature_functions.default_features") + @patch( + "snips_nlu.intent_parser.probabilistic_intent_parser" + ".ProbabilisticIntentParser.get_slots") + @patch( + "snips_nlu.intent_parser.probabilistic_intent_parser" + ".ProbabilisticIntentParser.get_intent") @patch("snips_nlu.intent_parser.regex_intent_parser.RegexIntentParser" ".get_intent") def test_should_handle_keyword_entities(self, mocked_regex_get_intent, mocked_crf_get_intent, - mocked_crf_get_slots, _): + mocked_crf_get_slots, + mocked_default_features): # Given language = Language.EN dataset = validate_and_format_dataset({ @@ -243,25 +232,20 @@ def test_should_handle_keyword_entities(self, mocked_regex_get_intent, "language": language.iso_code }) - def mocked_regex_intent(_): - return None - - def mocked_crf_intent(_): - return IntentClassificationResult("dummy_intent_1", 1.0) - - def mocked_crf_slots(_, intent=None): - return [ParsedSlot(match_range=(0, 7), - value="dummy_3", - entity="dummy_entity_1", - slot_name="dummy_slot_name"), - ParsedSlot(match_range=(8, 15), - value="dummy_4", - entity="dummy_entity_2", - slot_name="other_dummy_slot_name")] - - mocked_regex_get_intent.side_effect = mocked_regex_intent - mocked_crf_get_intent.side_effect = mocked_crf_intent - mocked_crf_get_slots.side_effect = mocked_crf_slots + mocked_default_features.return_value = [] + mocked_crf_intent = IntentClassificationResult("dummy_intent_1", 1.0) + mocked_crf_slots = [ParsedSlot(match_range=(0, 7), + value="dummy_3", + entity="dummy_entity_1", + slot_name="dummy_slot_name"), + ParsedSlot(match_range=(8, 15), + value="dummy_4", + entity="dummy_entity_2", + slot_name="other_dummy_slot_name")] + + mocked_regex_get_intent.return_value = None + mocked_crf_get_intent.return_value = mocked_crf_intent + mocked_crf_get_slots.return_value = mocked_crf_slots engine = SnipsNLUEngine(language) text = "dummy_3 dummy_4" @@ -272,24 +256,26 @@ def mocked_crf_slots(_, intent=None): # Then expected_result = Result( - text, parsed_intent=mocked_crf_intent(text), + text, parsed_intent=mocked_crf_intent, parsed_slots=[ParsedSlot(match_range=(8, 15), value="dummy_4", entity="dummy_entity_2", slot_name="other_dummy_slot_name")]) \ .as_dict() self.assertEqual(result, expected_result) - @patch("snips_nlu.slot_filler.feature_functions.default_features", - side_effect=mocked_default) - @patch("snips_nlu.intent_parser.crf_intent_parser.CRFIntentParser" - ".get_slots") - @patch("snips_nlu.intent_parser.crf_intent_parser.CRFIntentParser" - ".get_intent") + @patch("snips_nlu.slot_filler.feature_functions.default_features") + @patch( + "snips_nlu.intent_parser.probabilistic_intent_parser" + ".ProbabilisticIntentParser.get_slots") + @patch( + "snips_nlu.intent_parser.probabilistic_intent_parser" + ".ProbabilisticIntentParser.get_intent") @patch("snips_nlu.intent_parser.regex_intent_parser.RegexIntentParser" ".get_intent") def test_synonyms_should_point_to_base_value(self, mocked_regex_get_intent, mocked_crf_get_intent, - 
mocked_crf_get_slots, _): + mocked_crf_get_slots, + mocked_default_features): # Given language = Language.EN dataset = validate_and_format_dataset({ @@ -327,20 +313,15 @@ def test_synonyms_should_point_to_base_value(self, mocked_regex_get_intent, "language": language.iso_code }) - def mocked_regex_intent(_): - return None + mocked_default_features.return_value = [] + mocked_crf_intent = IntentClassificationResult("dummy_intent_1", 1.0) + mocked_crf_slots = [ParsedSlot(match_range=(0, 10), value="dummy1_bis", + entity="dummy_entity_1", + slot_name="dummy_slot_name")] - def mocked_crf_intent(_): - return IntentClassificationResult("dummy_intent_1", 1.0) - - def mocked_crf_slots(_, intent=None): - return [ParsedSlot(match_range=(0, 10), value="dummy1_bis", - entity="dummy_entity_1", - slot_name="dummy_slot_name")] - - mocked_regex_get_intent.side_effect = mocked_regex_intent - mocked_crf_get_intent.side_effect = mocked_crf_intent - mocked_crf_get_slots.side_effect = mocked_crf_slots + mocked_regex_get_intent.return_value = None + mocked_crf_get_intent.return_value = mocked_crf_intent + mocked_crf_get_slots.return_value = mocked_crf_slots engine = SnipsNLUEngine(language).fit(dataset) text = "dummy1_bis" @@ -350,55 +331,160 @@ def mocked_crf_slots(_, intent=None): # Then expected_result = Result( - text, parsed_intent=mocked_crf_intent(text), + text, parsed_intent=mocked_crf_intent, parsed_slots=[ParsedSlot(match_range=(0, 10), value="dummy1", entity="dummy_entity_1", slot_name="dummy_slot_name")]) \ .as_dict() self.assertEqual(result, expected_result) - def test_builtin_nlu_engine(self): + @patch("snips_nlu.slot_filler.feature_functions.default_features") + @patch("snips_nlu.intent_parser.regex_intent_parser" + ".RegexIntentParser.get_intent") + @patch("snips_nlu.intent_parser.probabilistic_intent_parser" + ".ProbabilisticIntentParser.get_intent") + def test_ui_parse_should_return_builtin( + self, mocked_probabilistic_get_intent, + mocked_regex_get_intent, mocked_default_features): # Given + mocked_default_features.return_value = [] + mocked_probabilistic_get_intent.return_value = None + mocked_regex_get_intent.return_value = None + language = Language.EN - texts = [ - "there is nothing here", - "Order me 2 book", - "can you join me at 2pm tomorrow?" - ] - engine = BuiltInEntitiesNLUEngine(language) + dataset = validate_and_format_dataset({ + "intents": { + "dummy_intent_1": { + ENGINE_TYPE: CUSTOM_ENGINE, + "utterances": [ + { + "data": [ + { + "text": "dummy 1", + "entity": "dummy_entity_1", + "slot_name": "dummy_slot_name" + } + ] + } + ] + } + }, + "entities": { + "dummy_entity_1": { + "use_synonyms": True, + "automatically_extensible": False, + "data": [ + { + "value": "dummy1", + "synonyms": [ + "dummy1", + "dummy1_bis" + ] + } + ] + } + }, + "language": language.iso_code + }) + engine = SnipsNLUEngine(language).fit(dataset) # When - results = [engine.parse(t) for t in texts] + text = "let's meet tomorrow at 3, what do you think?" 
+        results = engine.parse(text, intent="dummy_intent_1",
+                               force_builtin_entities=True)

         # Then
-        expected_results = [
-            {
-                'intent': None,
-                'slots': None,
-                'text': 'there is nothing here'
+        expected_results = {
+            'intent': {'intent_name': 'dummy_intent_1', 'probability': 1.0},
+            'slots': [
+                {
+                    "range": [11, 24],
+                    "value": "tomorrow at 3",
+                    "slot_name": "snips/datetime"
+                }
+            ],
+            "text": text
+        }
+
+        self.assertEqual(results, expected_results)
+
+    @patch("snips_nlu.slot_filler.feature_functions.default_features")
+    @patch("snips_nlu.intent_parser.regex_intent_parser"
+           ".RegexIntentParser.get_intent")
+    @patch("snips_nlu.intent_parser.regex_intent_parser"
+           ".RegexIntentParser.get_slots")
+    @patch("snips_nlu.intent_parser.probabilistic_intent_parser"
+           ".ProbabilisticIntentParser.get_intent")
+    def test_parse_with_builtin_force_should_return_custom_when_overlapping(
+            self, mocked_probabilistic_get_intent, mocked_regex_get_slots,
+            mocked_regex_get_intent, mocked_default_features):
+
+        # Given
+        intent_name = "dummy_intent_1"
+        text = "let's meet tomorrow at 3, what do you think?"
+        mocked_default_features.return_value = []
+        mocked_probabilistic_get_intent.return_value = None
+        mocked_regex_get_intent.return_value = IntentClassificationResult(
+            intent_name=intent_name, probability=1.0)
+        range = [11, 24]
+        value = "tomorrow at 3"
+        entity = "my_datetime"
+        slot_name = "my_datetime"
+        mocked_regex_get_slots.return_value = [ParsedSlot(
+            range, value, entity, slot_name)]
+
+        language = Language.EN
+        dataset = validate_and_format_dataset({
+            "intents": {
+                intent_name: {
+                    ENGINE_TYPE: CUSTOM_ENGINE,
+                    "utterances": [
+                        {
+                            "data": [
+                                {
+                                    "text": "dummy 1",
+                                    "entity": "dummy_entity_1",
+                                    "slot_name": "dummy_slot_name"
+                                }
+                            ]
+                        }
+                    ]
+                }
             },
-            {
-                'intent': None,
-                'slots': [
-                    {
-                        'range': [9, 10],
-                        'slot_name': 'snips/number',
-                        'value': '2'
-                    }
-                ],
-                'text': 'Order me 2 book'
+            "entities": {
+                "dummy_entity_1": {
+                    "use_synonyms": True,
+                    "automatically_extensible": False,
+                    "data": [
+                        {
+                            "value": "dummy1",
+                            "synonyms": [
+                                "dummy1",
+                                "dummy1_bis"
+                            ]
+                        }
+                    ]
+                }
             },
-            {
-                'intent': None,
-                'slots': [
-                    {
-                        'range': [16, 31],
-                        'slot_name': 'snips/datetime',
-                        'value': 'at 2pm tomorrow'
-                    }
-                ],
-                'text': 'can you join me at 2pm tomorrow?'
-            }
-        ]
+            "language": language.iso_code
+        })
+        engine = SnipsNLUEngine(language).fit(dataset)
+
+        # When
+        results = engine.parse(text, intent=intent_name,
+                               force_builtin_entities=True)
+
+        # Then
+        expected_results = {
+            'intent': {'intent_name': 'dummy_intent_1', 'probability': 1.0},
+            'slots': [
+                {
+                    "range": [11, 24],
+                    "value": "tomorrow at 3",
+                    "slot_name": slot_name
+                }
+            ],
+            "text": text
+        }

         self.assertEqual(results, expected_results)
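The recurring change in the test file above is the replacement of one-off side_effect helper functions with mock's return_value attribute, which is the idiomatic way to stub out a constant answer. A minimal self-contained sketch of the before/after pattern (the get_intent name is only illustrative, not tied to the classes in this changeset):

from mock import MagicMock

# Before: a throwaway function, wired in through side_effect, that
# ignores its argument and always returns the same stub value.
def mocked_get_intent(_):
    return None

classifier = MagicMock()
classifier.get_intent = MagicMock(side_effect=mocked_get_intent)
assert classifier.get_intent("any text") is None

# After: return_value expresses the same constant stub directly,
# with no helper function to define.
classifier.get_intent = MagicMock(return_value=None)
assert classifier.get_intent("any text") is None

side_effect is still the right tool when the stub must depend on its arguments, which is why the tagger mocks in test_nlu_engine_utils.py below keep it.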
diff --git a/snips_nlu/tests/test_nlu_engine_utils.py b/snips_nlu/tests/test_nlu_engine_utils.py
new file mode 100644
index 000000000..ef7ed727e
--- /dev/null
+++ b/snips_nlu/tests/test_nlu_engine_utils.py
@@ -0,0 +1,99 @@
+from __future__ import unicode_literals
+
+import unittest
+
+from mock import MagicMock
+
+from snips_nlu.built_in_entities import BuiltInEntity
+from snips_nlu.constants import MATCH_RANGE, VALUE, ENTITY
+from snips_nlu.nlu_engine import augment_slots, spans_to_tokens_indexes
+from snips_nlu.result import ParsedSlot
+from snips_nlu.slot_filler.crf_utils import TaggingScheme, BEGINNING_PREFIX, \
+    INSIDE_PREFIX
+from snips_nlu.tokenization import Token
+
+
+class TestNLUEngineUtils(unittest.TestCase):
+    def test_spans_to_tokens_indexes(self):
+        # Given
+        spans = [
+            (0, 1),
+            (2, 6),
+            (5, 6),
+            (9, 15)
+        ]
+        tokens = [
+            Token(value="abc", start=0, end=3, stem="abc"),
+            Token(value="def", start=4, end=7, stem="def"),
+            Token(value="ghi", start=10, end=13, stem="ghi")
+        ]
+
+        # When
+        indexes = spans_to_tokens_indexes(spans, tokens)
+
+        # Then
+        expected_indexes = [[0], [0, 1], [1], [2]]
+        self.assertListEqual(indexes, expected_indexes)
+
+    def test_augment_slots(self):
+        # Given
+        text = "Find me a flight before 10pm and after 8pm"
+        intent_slots_mapping = {
+            "start_date": "snips/datetime",
+            "end_date": "snips/datetime",
+        }
+        missing_slots = {"start_date", "end_date"}
+        builtin_entities = [
+            {
+                MATCH_RANGE: (16, 28),
+                VALUE: " before 10pm",
+                ENTITY: BuiltInEntity.DATETIME
+            },
+            {
+                MATCH_RANGE: (33, 42),
+                VALUE: "after 8pm",
+                ENTITY: BuiltInEntity.DATETIME
+            }
+        ]
+
+        def mocked_get_tags(tokens):
+            return ['O' for _ in tokens]
+
+        def mocked_sequence_probability(tokens, tags):
+            first_tags = ['O' for _ in tokens]
+            first_tags[4] = '%sstart_date' % BEGINNING_PREFIX
+            first_tags[5] = '%sstart_date' % INSIDE_PREFIX
+            first_tags[7] = '%send_date' % BEGINNING_PREFIX
+            first_tags[8] = '%send_date' % INSIDE_PREFIX
+
+            second_tags = ['O' for _ in tokens]
+            second_tags[4] = '%send_date' % BEGINNING_PREFIX
+            second_tags[5] = '%send_date' % INSIDE_PREFIX
+            second_tags[7] = '%sstart_date' % BEGINNING_PREFIX
+            second_tags[8] = '%sstart_date' % INSIDE_PREFIX
+
+            if tags == first_tags:
+                return 0.6
+            if tags == second_tags:
+                return 0.8
+            else:
+                raise ValueError("Unexpected tag sequence: %s" % tags)
+
+        tagger = MagicMock()
+        tagger.get_tags = MagicMock(side_effect=mocked_get_tags)
+        tagger.get_sequence_probability = MagicMock(
+            side_effect=mocked_sequence_probability)
+        tagger.tagging_scheme = TaggingScheme.BIO
+
+        # When
+        augmented_slots = augment_slots(text, tagger, intent_slots_mapping,
+                                        builtin_entities, missing_slots)
+
+        # Then
+        expected_slots = [
+            ParsedSlot(value='before 10pm', match_range=(17, 28),
+                       entity='snips/datetime', slot_name='end_date'),
+            ParsedSlot(value='after 8pm', match_range=(33, 42),
+                       entity='snips/datetime', slot_name='start_date')
+        ]
+        self.assertListEqual(augmented_slots, expected_slots)
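The contract that test_spans_to_tokens_indexes pins down is: for each character span, collect the indexes of every token whose character range overlaps it. The following is a hypothetical re-implementation inferred from those fixtures, not the actual code shipped in snips_nlu.nlu_engine:

# Hypothetical sketch of spans_to_tokens_indexes, inferred from the
# expectations in the test above; the real function lives in
# snips_nlu.nlu_engine and may differ in detail.
def spans_to_tokens_indexes(spans, tokens):
    tokens_indexes = []
    for span_start, span_end in spans:
        # A token matches when its [start, end) range overlaps the span:
        # e.g. span (2, 6) overlaps both "abc" (0-3) and "def" (4-7).
        tokens_indexes.append([i for i, token in enumerate(tokens)
                               if span_start < token.end
                               and span_end > token.start])
    return tokens_indexes

Run against the fixtures above, this sketch reproduces the expected output [[0], [0, 1], [1], [2]].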
diff --git a/snips_nlu/tests/test_regex_intent_parser.py b/snips_nlu/tests/test_regex_intent_parser.py
index edc830fb9..8486e14f0 100644
--- a/snips_nlu/tests/test_regex_intent_parser.py
+++ b/snips_nlu/tests/test_regex_intent_parser.py
@@ -68,6 +68,7 @@ def test_should_be_serializable(self):
         parser_dict = parser.to_dict()

         # Then
+        # noinspection PyBroadException
         try:
             json.dumps(parser_dict).decode("utf-8")
         except:
@@ -136,6 +137,7 @@ def test_should_be_deserializable(self):
             slot_names_to_entities=slot_names_to_entities
         )

+        # noinspection PyBroadException
         try:
             parser_json = json.dumps(parser_dict).decode("utf-8")
             _ = RegexIntentParser.from_dict(json.loads(parser_json))
diff --git a/snips_nlu/tests/test_resources.py b/snips_nlu/tests/test_resources.py
new file mode 100644
index 000000000..7d61a927a
--- /dev/null
+++ b/snips_nlu/tests/test_resources.py
@@ -0,0 +1,16 @@
+import unittest
+
+from snips_nlu.languages import Language
+from snips_nlu.resources import RESOURCE_INDEX
+
+
+class TestResources(unittest.TestCase):
+    def test_resources_index_should_have_all_languages(self):
+        # Given
+        index = RESOURCE_INDEX
+
+        # When
+        languages = index.keys()
+
+        # Then
+        self.assertEqual(len(languages), len(Language.__members__))
diff --git a/snips_nlu/tests/test_result.py b/snips_nlu/tests/test_result.py
index 2a15f8f27..eab4cda7b 100644
--- a/snips_nlu/tests/test_result.py
+++ b/snips_nlu/tests/test_result.py
@@ -1,8 +1,9 @@
 import json
 import unittest

-from snips_nlu.constants import (PARSED_INTENT, PARSED_SLOTS, TEXT, INTENT_NAME,
-                                 PROBABILITY, MATCH_RANGE, SLOT_NAME, VALUE)
+from snips_nlu.constants import (PARSED_INTENT, PARSED_SLOTS, TEXT,
+                                 INTENT_NAME, PROBABILITY, MATCH_RANGE,
+                                 SLOT_NAME, VALUE)
 from snips_nlu.result import Result, IntentClassificationResult, ParsedSlot


@@ -18,6 +19,7 @@ def test_should_serialize_results(self):
         result_dict = result.as_dict()

         # Then
+        # noinspection PyBroadException
         try:
             json.dumps(result_dict)
         except:
diff --git a/snips_nlu/tests/test_slot_filler_data_augmentation.py b/snips_nlu/tests/test_slot_filler_data_augmentation.py
index 3f0a6c196..2fe844823 100644
--- a/snips_nlu/tests/test_slot_filler_data_augmentation.py
+++ b/snips_nlu/tests/test_slot_filler_data_augmentation.py
@@ -277,20 +277,10 @@ def test_get_noise_iterator(self, mocked_get_subtitles, mocked_choice,
         language = Language.EN
         min_size, max_size = 2, 3

-        def subtitles(language):
-            return ["a b c d", "e", "f g h"]
-
-        mocked_get_subtitles.side_effect = subtitles
-
-        def choice(_):
-            return 2
-
-        mocked_choice.side_effect = choice
-
-        def randint(a, b):
-            return 0
-
-        mocked_randint.side_effect = randint
+        mocked_subtitles = ["a b c d", "e", "f g h"]
+        mocked_get_subtitles.return_value = mocked_subtitles
+        mocked_choice.return_value = 2
+        mocked_randint.return_value = 0

         it = get_noise_iterator(language, min_size, max_size)
diff --git a/snips_nlu/tests/test_snips_intent_classifier.py b/snips_nlu/tests/test_snips_intent_classifier.py
index d814dd2dc..975b33a74 100644
--- a/snips_nlu/tests/test_snips_intent_classifier.py
+++ b/snips_nlu/tests/test_snips_intent_classifier.py
@@ -104,12 +104,14 @@ def mock_from_dict(_):
         pickled_classifier = safe_pickle_dumps(intent_classifier.classifier)

         # Then
+        # noinspection PyBroadException
         try:
             dumped = json.dumps(classifier_dict).encode("utf-8")
         except:
             self.fail("SnipsIntentClassifier dict should be json serializable "
                       "to utf-8")

+        # noinspection PyBroadException
         try:
             _ = SnipsIntentClassifier.from_dict(json.loads(dumped))
         except:
diff --git a/snips_nlu/utils.py b/snips_nlu/utils.py
index 39b69b752..39c928374 100644
--- a/snips_nlu/utils.py
+++ b/snips_nlu/utils.py
@@ -16,6 +16,8 @@


 def instance_from_dict(obj_dict):
+    if obj_dict is None:
+        return None
     module = obj_dict[MODULE_NAME]
     class_name = obj_dict[CLASS_NAME]
     obj_class = getattr(importlib.import_module(module), class_name)
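Taken together, the two new engine tests above pin down the extended parse signature: with force_builtin_entities=True, built-in entity matches fill the slots of the requested intent, except where a custom slot returned by the rule-based parser overlaps the same character range, in which case the custom slot wins. A usage sketch, assuming `dataset` is one of the fixtures defined in those tests rather than a public example:

from snips_nlu.languages import Language
from snips_nlu.nlu_engine import SnipsNLUEngine

# `dataset` is assumed to be one of the test fixtures shown above
engine = SnipsNLUEngine(Language.EN).fit(dataset)
result = engine.parse("let's meet tomorrow at 3, what do you think?",
                      intent="dummy_intent_1",
                      force_builtin_entities=True)
# Per the tests, `result` is a dict of the form:
# {
#     "intent": {"intent_name": "dummy_intent_1", "probability": 1.0},
#     "slots": [{"range": [11, 24], "value": "tomorrow at 3",
#                "slot_name": "snips/datetime"}],
#     "text": "let's meet tomorrow at 3, what do you think?"
# }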