Skip to content

Commit

Permalink
Merge pull request #199 from snipsco/release/0.3.4
Browse files Browse the repository at this point in the history
Bump version number
  • Loading branch information
Adrien Ball authored May 3, 2017
2 parents f74211f + 9b858a3 commit 71f1acd
Show file tree
Hide file tree
Showing 31 changed files with 1,032 additions and 569 deletions.
3 changes: 3 additions & 0 deletions snips_nlu/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from duckling import core

from snips_nlu.resources import load_resources
from snips_nlu.utils import ROOT_PATH, PACKAGE_NAME

core.load()
Expand All @@ -11,3 +12,5 @@

with io.open(os.path.join(ROOT_PATH, PACKAGE_NAME, VERSION_FILE_NAME)) as f:
__version__ = f.readline().strip()

load_resources()
2 changes: 1 addition & 1 deletion snips_nlu/__version__
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.3.3
0.3.4
61 changes: 33 additions & 28 deletions snips_nlu/constants.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,33 @@
INTENT_NAME = u"intent_name"
PROBABILITY = u"probability"
PARSED_INTENT = u"intent"
PARSED_SLOTS = u"slots"
TEXT = u"text"
AUTOMATICALLY_EXTENSIBLE = u"automatically_extensible"
USE_SYNONYMS = u"use_synonyms"
SYNONYMS = u"synonyms"
DATA = u"data"
INTENTS = u"intents"
ENTITIES = u"entities"
ENTITY = u"entity"
SLOT_NAME = u"slot_name"
UTTERANCES = u"utterances"
ENGINE_TYPE = u"engineType"
CUSTOM_ENGINE = u"regex"
BUILTIN_ENGINE = u"tensorflow"
LANGUAGE = u"language"
MATCH_RANGE = u"range"
VALUE = u"value"
CUSTOM_PARSERS = u"custom_parsers"
BUILTIN_PARSER = u"builtin_parser"
BUILTIN_PATH = u"builtin_path"
BUILTIN_BINARY = u"builtin_binary"
LABEL = u"label"
DUCKLING_DIM = u"duckling_dim"
NGRAM = u"ngram"
TOKEN_INDEXES = u"token_indexes"
from __future__ import unicode_literals

# Shared string constants used as dictionary keys and engine identifiers
# across the snips_nlu package. Centralizing them here avoids typo-prone
# raw string literals at call sites.

# --- Parsing result keys (intent classification / slot filling output) ---
INTENT_NAME = "intent_name"
PROBABILITY = "probability"
PARSED_INTENT = "intent"
PARSED_SLOTS = "slots"
TEXT = "text"

# --- Dataset schema keys (entities, intents, utterances) ---
AUTOMATICALLY_EXTENSIBLE = "automatically_extensible"
USE_SYNONYMS = "use_synonyms"
SYNONYMS = "synonyms"
DATA = "data"
INTENTS = "intents"
ENTITIES = "entities"
ENTITY = "entity"
SLOT_NAME = "slot_name"
UTTERANCES = "utterances"

# --- Engine selection ---
# NOTE: the key is camelCase ("engineType") while the values name the
# underlying implementations ("regex" for the custom engine, "tensorflow"
# for the builtin one) — presumably fixed by an external data format;
# verify against the dataset producers before changing.
ENGINE_TYPE = "engineType"
CUSTOM_ENGINE = "regex"
BUILTIN_ENGINE = "tensorflow"

LANGUAGE = "language"
# "range" is the key under which a slot's character span is stored.
MATCH_RANGE = "range"
VALUE = "value"

# --- Builtin (precompiled) parser configuration keys ---
BUILTIN_PARSER = "builtin_parser"
BUILTIN_PATH = "builtin_path"
BUILTIN_BINARY = "builtin_binary"

# --- Feature extraction / CRF tagging keys ---
LABEL = "label"
DUCKLING_DIM = "duckling_dim"
NGRAM = "ngram"
TOKEN_INDEXES = "token_indexes"

# --- Language-resource names (see snips_nlu.resources) ---
GAZETTEERS = "gazetteers"
STOP_WORDS = "stop_words"
SUBTITLES = "subtitles"
WORD_CLUSTERS = "word_clusters"
7 changes: 4 additions & 3 deletions snips_nlu/intent_classifier/data_augmentation.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import numpy as np
from uuid import uuid4

import numpy as np

from snips_nlu.constants import INTENTS, UTTERANCES, DATA
from snips_nlu.dataset import get_text_from_chunks
from snips_nlu.intent_classifier.intent_classifier_resources import \
get_subtitles
from snips_nlu.preprocessing import stem_sentence
from snips_nlu.resources import get_subtitles

NOISE_NAME = str(uuid4()).decode()

Expand Down
2 changes: 1 addition & 1 deletion snips_nlu/intent_classifier/feature_extraction.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2

from intent_classifier_resources import get_stop_words
from snips_nlu.resources import get_stop_words
from snips_nlu.languages import Language
from snips_nlu.utils import ensure_string, safe_pickle_dumps, safe_pickle_loads
from snips_nlu.tokenization import tokenize_light
Expand Down
31 changes: 0 additions & 31 deletions snips_nlu/intent_classifier/intent_classifier_resources.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from intent_parser import IntentParser
from snips_nlu.constants import (DATA, INTENTS, SLOT_NAME, UTTERANCES, ENTITY,
CUSTOM_ENGINE)
from snips_nlu.constants import (DATA, INTENTS, CUSTOM_ENGINE)
from snips_nlu.dataset import filter_dataset
from snips_nlu.languages import Language
from snips_nlu.result import ParsedSlot
from snips_nlu.slot_filler.crf_tagger import CRFTagger
from snips_nlu.slot_filler.crf_utils import (tags_to_slots,
utterance_to_sample)
Expand Down Expand Up @@ -41,21 +39,10 @@ def default_data_augmentation_config(language):
return DataAugmentationConfig()


def get_slot_name_to_entity_mapping(dataset):
slot_name_to_entity = dict()
for intent in dataset[INTENTS].values():
for utterance in intent[UTTERANCES]:
for chunk in utterance[DATA]:
if SLOT_NAME in chunk:
slot_name_to_entity[chunk[SLOT_NAME]] = chunk[ENTITY]
return slot_name_to_entity


class CRFIntentParser(IntentParser):
class ProbabilisticIntentParser(IntentParser):
def __init__(self, language, intent_classifier, crf_taggers,
slot_name_to_entity_mapping=None,
data_augmentation_config=None):
super(CRFIntentParser, self).__init__()
slot_name_to_entity_mapping, data_augmentation_config=None):
super(ProbabilisticIntentParser, self).__init__()
self.language = language
self.intent_classifier = intent_classifier
self._crf_taggers = None
Expand All @@ -78,29 +65,28 @@ def crf_taggers(self, value):

def get_intent(self, text):
if not self.fitted:
raise ValueError("CRFIntentParser must be fitted before "
raise ValueError("ProbabilisticIntentParser must be fitted before "
"`get_intent` is called")
return self.intent_classifier.get_intent(text)

def get_slots(self, text, intent=None):
if intent is None:
raise ValueError("intent can't be None")
if not self.fitted:
raise ValueError("CRFIntentParser must be fitted before "
raise ValueError("ProbabilisticIntentParser must be fitted before "
"`get_slots` is called")
if intent not in self.crf_taggers:
raise KeyError("Invalid intent '%s'" % intent)

tokens = tokenize(text)
if len(tokens) == 0:
return []
intent_slots_mapping = self.slot_name_to_entity_mapping[intent]
tagger = self.crf_taggers[intent]

tags = tagger.get_tags(tokens)
slots = tags_to_slots(tokens, tags,
tagging_scheme=tagger.tagging_scheme)
return [ParsedSlot(match_range=s["range"],
value=text[s["range"][0]:s["range"][1]],
entity=self.slot_name_to_entity_mapping[
s[SLOT_NAME]],
slot_name=s[SLOT_NAME]) for s in slots]
slots = tags_to_slots(text, tokens, tags, tagger.tagging_scheme,
intent_slots_mapping)
return slots

@property
def fitted(self):
Expand All @@ -109,8 +95,6 @@ def fitted(self):

def fit(self, dataset):
custom_dataset = filter_dataset(dataset, CUSTOM_ENGINE)
self.slot_name_to_entity_mapping = get_slot_name_to_entity_mapping(
custom_dataset)
self.intent_classifier = self.intent_classifier.fit(dataset)
for intent_name in custom_dataset[INTENTS]:
augmented_intent_utterances = augment_utterances(
Expand Down
30 changes: 9 additions & 21 deletions snips_nlu/intent_parser/regex_intent_parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import operator
import re

from snips_nlu.built_in_entities import BuiltInEntity
from snips_nlu.constants import (TEXT, USE_SYNONYMS, SYNONYMS, DATA, INTENTS,
ENTITIES, SLOT_NAME, UTTERANCES, VALUE,
ENTITY, CUSTOM_ENGINE)
from snips_nlu.built_in_entities import BuiltInEntity
from snips_nlu.dataset import filter_dataset
from snips_nlu.intent_parser.intent_parser import IntentParser
from snips_nlu.result import (IntentClassificationResult,
Expand Down Expand Up @@ -86,8 +85,9 @@ def get_joined_entity_utterances(dataset):
for syn in entry[SYNONYMS]]
else:
utterances = [entry[VALUE] for entry in entity[DATA]]
utterances_patterns = [re.escape(e) for e in utterances]
joined_entity_utterances[entity_name] = r"|".join(
sorted([re.escape(e) for e in utterances], key=len, reverse=True))
sorted(utterances_patterns, key=len, reverse=True))
return joined_entity_utterances


Expand Down Expand Up @@ -146,24 +146,12 @@ def get_intent(self, text):
if not self.fitted:
raise AssertionError("RegexIntentParser must be fitted before "
"calling `get_entities`")
entities_per_intent = dict()
for intent in self.regexes_per_intent.keys():
entities_per_intent[intent] = self.get_slots(text, intent)

intents_probas = dict()
total_nb_entities = sum(
len(entities) for entities in entities_per_intent.values())
# TODO: handle intents without slots
if total_nb_entities == 0:
return None
for intent_name, entities in entities_per_intent.iteritems():
intents_probas[intent_name] = float(len(entities)) / float(
total_nb_entities)

top_intent, top_proba = max(intents_probas.items(),
key=operator.itemgetter(1))
return IntentClassificationResult(intent_name=top_intent,
probability=top_proba)
for intent, regexes in self.regexes_per_intent.iteritems():
for regex in regexes:
if regex.match(text) is not None:
return IntentClassificationResult(intent_name=intent,
probability=1.0)
return None

def get_slots(self, text, intent=None):
if not self.fitted:
Expand Down
Loading

0 comments on commit 71f1acd

Please sign in to comment.