diff --git a/CHANGELOG.md b/CHANGELOG.md index b6f238a59..4b72c467a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,20 @@ # Changelog All notable changes to this project will be documented in this file. +## [0.16.2] - 2018-08-08 +### Added +- `automatically_extensible` flag in dataset generation tool +- System requirements +- Reference to chatito tool in documentation + +### Changed +- Bump `snips-nlu-ontology` to `0.57.3` +- Versions of dependencies are now defined more loosely + +### Fixed +- Issue with synonyms mapping +- Issue with `snips-nlu download-all-languages` CLI command + ## [0.16.1] - 2018-07-23 ### Added - Every processing unit can be persisted into (and loaded from) a `bytearray` @@ -113,6 +127,7 @@ several commands. - Fix compiling issue with `bindgen` dependency when installing from source - Fix issue in `CRFSlotFiller` when handling builtin entities +[0.16.2]: https://github.com/snipsco/snips-nlu/compare/0.16.1...0.16.2 [0.16.1]: https://github.com/snipsco/snips-nlu/compare/0.16.0...0.16.1 [0.16.0]: https://github.com/snipsco/snips-nlu/compare/0.15.1...0.16.0 [0.15.1]: https://github.com/snipsco/snips-nlu/compare/0.15.0...0.15.1 diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst index a76bb5d25..01e3c1dc1 100644 --- a/CONTRIBUTORS.rst +++ b/CONTRIBUTORS.rst @@ -5,3 +5,4 @@ This is a list of everyone who has made significant contributions to Snips NLU, * `Alice Coucke `_ * `Josh Meyer `_ +* `Matthieu Brouillard `_ \ No newline at end of file diff --git a/README.rst b/README.rst index a45844046..7bf970a47 100644 --- a/README.rst +++ b/README.rst @@ -24,6 +24,13 @@ Snips NLU Check out our `blog post`_ to get more details about why we built Snips NLU and how it works under the hood. +System requirements +------------------- +- 64-bit Linux, MacOS >= 10.11, 64-bit Windows +- Python 2.7 or Python >= 3.4 +- RAM: Snips NLU will typically use between 100MB and 200MB of RAM, depending on the language and the size of the dataset. + + Installation ------------ diff --git a/docs/source/installation.rst b/docs/source/installation.rst index fa0622e49..806a0a1e2 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -3,6 +3,13 @@ Installation ============ +System requirements +------------------- +- 64-bit Linux, MacOS >= 10.11, 64-bit Windows +- Python 2.7 or Python >= 3.4 +- RAM: Snips NLU will typically use between 100MB and 200MB of RAM, depending on the language and the size of the dataset. + + Python Version -------------- diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 3d0ce5202..a2b16ef6b 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -29,9 +29,10 @@ parse as well as easy to read. We created a `sample dataset`_ that you can check to better understand the format. -You have two options to create your dataset. You can build it manually by -respecting the format used in the sample or alternatively you can use the -dataset creation CLI that is contained in the lib. +You have three options to create your dataset. You can build it manually by +respecting the format used in the sample, you can use the dataset creation +CLI included in the lib, or you can use `chatito`_, a DSL +tool for dataset generation. We will go for the second option here and start by creating three files corresponding to our three intents and one entity file corresponding to the @@ -102,6 +103,15 @@ double quotes ``"``.
If the value contains double quotes, it must be doubled to be escaped like this: ``"A value with a "","" in it"`` which corresponds to the actual value ``A value with a "," in it``. +.. Note:: + + By default, entities are generated as :ref:`automatically extensible `, i.e. the recognition will accept values other than the ones listed in the entity file. + This behavior can be changed by adding the following line at the beginning of the entity file: + + .. code-block:: bash + + # automatically_extensible=false + We are now ready to generate our dataset: .. code-block:: bash @@ -364,3 +374,4 @@ Alternatively, you can persist/load the engine as a ``bytearray``: .. _sample dataset: https://github.com/snipsco/snips-nlu/blob/master/snips_nlu_samples/sample_dataset.json .. _default configurations: https://github.com/snipsco/snips-nlu/blob/master/snips_nlu/default_configs .. _english one: https://github.com/snipsco/snips-nlu/blob/master/snips_nlu/default_configs/config_en.py +.. _chatito: https://github.com/rodrigopivi/Chatito diff --git a/setup.py b/setup.py index eabd2285b..9e6f947de 100644 --- a/setup.py +++ b/setup.py @@ -13,44 +13,38 @@ about = dict() exec(f.read(), about) - with io.open(os.path.join(root, "README.rst"), encoding="utf8") as f: readme = f.read() -nlu_metrics_version = "0.12.0" - required = [ - "enum34==1.1.6", - "pathlib==1.0.1", + "enum34>=1.1,<2.0", "numpy==1.14.0", - "scipy==1.0.0", - "scikit-learn==0.19.1", - "sklearn-crfsuite==0.3.6", - "semantic_version==2.6.0", - "snips_nlu_utils==0.6.1", - "snips_nlu_ontology==0.57.2", - "num2words==0.5.6", - "plac==0.9.6", - "requests==2.18.4" + "scipy>=1.0,<2.0", + "scikit-learn>=0.19,<0.20", + "sklearn-crfsuite>=0.3.6,<0.4", + "semantic_version>=2.6,<3.0", + "snips_nlu_utils>=0.6.1,<0.7", + "snips_nlu_ontology==0.57.3", + "num2words>=0.5.6,<0.6", + "plac>=0.9.6,<1.0", + "requests>=2.0,<3.0", + "pathlib==1.0.1; python_version < '3.4'", ] extras_require = { "doc": [ - "sphinx==1.7.1", - "sphinxcontrib-napoleon==0.6.1", - "sphinx-rtd-theme==0.2.4" + "sphinx>=1.7,<2.0", + "sphinxcontrib-napoleon>=0.6.1,<0.7", + "sphinx-rtd-theme>=0.2.4,<0.3" ], "metrics": [ - "snips_nlu_metrics==%s" % nlu_metrics_version, + "snips_nlu_metrics>=0.13,<0.14", ], "test": [ - "mock==2.0.0", - "snips_nlu_metrics==%s" % nlu_metrics_version, - "pylint==1.8.2", - "coverage==4.4.2" - ], - "integration_test": [ - "snips_nlu_metrics==%s" % nlu_metrics_version, + "mock>=2.0,<3.0", + "snips_nlu_metrics>=0.13,<0.14", + "pylint>=1.8,<2.0", + "coverage>=4.4.2,<5.0" ] } diff --git a/snips_nlu/__about__.py b/snips_nlu/__about__.py index 5c6c57e34..b1847715b 100644 --- a/snips_nlu/__about__.py +++ b/snips_nlu/__about__.py @@ -11,7 +11,7 @@ __email__ = "clement.doumouro@snips.ai, adrien.ball@snips.ai" __license__ = "Apache License, Version 2.0" -__version__ = "0.16.1" +__version__ = "0.16.2" __model_version__ = "0.16.0" __download_url__ = "https://github.com/snipsco/snips-nlu-language-resources/releases/download" diff --git a/snips_nlu/cli/dataset/entities.py b/snips_nlu/cli/dataset/entities.py index 07da20735..1ecec8744 100644 --- a/snips_nlu/cli/dataset/entities.py +++ b/snips_nlu/cli/dataset/entities.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import csv +import re from abc import ABCMeta, abstractmethod from pathlib import Path @@ -12,6 +13,7 @@ from snips_nlu.constants import ( VALUE, SYNONYMS, AUTOMATICALLY_EXTENSIBLE, USE_SYNONYMS, DATA) +AUTO_EXT_REGEX = re.compile(r'^#\sautomatically_extensible=(true|false)\s*$') class Entity(with_metaclass(ABCMeta,
object)): def __init__(self, name): @@ -56,17 +58,23 @@ def from_file(cls, filepath): if six.PY2: it = list(utf_8_encoder(it)) reader = csv.reader(list(it)) + autoextent = True for row in reader: if six.PY2: row = [cell.decode("utf-8") for cell in row] value = row[0] + if reader.line_num == 1: + m = AUTO_EXT_REGEX.match(row[0]) + if m: + autoextent = not m.group(1).lower() == 'false' + continue if len(row) > 1: synonyms = row[1:] else: synonyms = [] utterances.append(EntityUtterance(value, synonyms)) - return cls(entity_name, utterances, automatically_extensible=True, - use_synonyms=True) + return cls(entity_name, utterances, + automatically_extensible=autoextent, use_synonyms=True) @property def json(self): diff --git a/snips_nlu/cli/dataset/examples/entity_location_autoextent_false.txt b/snips_nlu/cli/dataset/examples/entity_location_autoextent_false.txt new file mode 100644 index 000000000..243c4d290 --- /dev/null +++ b/snips_nlu/cli/dataset/examples/entity_location_autoextent_false.txt @@ -0,0 +1,4 @@ +# automatically_extensible=false +new york,big apple +paris,city of lights +london \ No newline at end of file diff --git a/snips_nlu/cli/download.py b/snips_nlu/cli/download.py index b1b12613d..44e73f457 100644 --- a/snips_nlu/cli/download.py +++ b/snips_nlu/cli/download.py @@ -70,7 +70,7 @@ def download(resource_name, direct=False, def download_all_languages(*pip_args): """Download compatible resources for all supported languages""" for language in get_all_languages(): - download(language, *pip_args) + download(language, False, *pip_args) def _get_compatibility(): @@ -106,7 +106,7 @@ def _get_installed_languages(): for directory in DATA_PATH.iterdir(): if not directory.is_dir(): continue - with (directory / "metadata.json").open() as f: + with (directory / "metadata.json").open(encoding="utf8") as f: metadata = json.load(f) languages.add(metadata["language"]) return languages diff --git a/snips_nlu/cli/generate_dataset.py b/snips_nlu/cli/generate_dataset.py index aff954102..ffb0cea89 100644 --- a/snips_nlu/cli/generate_dataset.py +++ b/snips_nlu/cli/generate_dataset.py @@ -1,4 +1,4 @@ -from __future__ import unicode_literals +from __future__ import print_function, unicode_literals import json diff --git a/snips_nlu/cli/inference.py b/snips_nlu/cli/inference.py index 24f62bbe1..cfcfe2819 100644 --- a/snips_nlu/cli/inference.py +++ b/snips_nlu/cli/inference.py @@ -1,4 +1,4 @@ -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function import json from builtins import input diff --git a/snips_nlu/cli/training.py b/snips_nlu/cli/training.py index a057d4423..1dd340bfa 100644 --- a/snips_nlu/cli/training.py +++ b/snips_nlu/cli/training.py @@ -1,4 +1,4 @@ -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function import json from pathlib import Path diff --git a/snips_nlu/dataset.py b/snips_nlu/dataset.py index e9a627b8b..9df081ff1 100644 --- a/snips_nlu/dataset.py +++ b/snips_nlu/dataset.py @@ -1,9 +1,10 @@ from __future__ import division, unicode_literals import json -from builtins import str +from collections import Counter from copy import deepcopy +from builtins import str from future.utils import iteritems, itervalues from snips_nlu_ontology import get_all_languages @@ -97,16 +98,23 @@ def has_any_capitalization(entity_utterances, language): return False -def add_variation_if_needed(utterances, variation, utterance, language): - if not variation: - return utterances - all_variations = 
get_string_variations(variation, language) - for v in all_variations: - if v not in utterances: - utterances[v] = utterance +def add_entity_variations(utterances, entity_variations, entity_value): + utterances[entity_value] = entity_value + for variation in entity_variations[entity_value]: + if variation: + utterances[variation] = entity_value return utterances +def _extract_entity_values(entity): + values = set() + for ent in entity[DATA]: + values.add(ent[VALUE]) + if entity[USE_SYNONYMS]: + values.update(set(ent[SYNONYMS])) + return values + + def validate_and_format_custom_entity(entity, queries_entities, language): validate_type(entity, dict) mandatory_keys = [USE_SYNONYMS, AUTOMATICALLY_EXTENSIBLE, DATA] @@ -139,33 +147,59 @@ def validate_and_format_custom_entity(entity, queries_entities, language): formatted_entity[CAPITALIZE] = has_any_capitalization(queries_entities, language) - # Normalize - validated_data = dict() - for entry in entity[DATA]: - entry_value = entry[VALUE] - validated_data = add_variation_if_needed( - validated_data, entry_value, entry_value, language) - + validated_utterances = dict() + # Map original values and synonyms + for data in entity[DATA]: + ent_value = data[VALUE] + if not ent_value: + continue + validated_utterances[ent_value] = ent_value if use_synonyms: - for s in entry[SYNONYMS]: - validated_data = add_variation_if_needed( - validated_data, s, entry_value, language) - - formatted_entity[UTTERANCES] = validated_data - # Merge queries_entities - for value in queries_entities: - formatted_entity = add_entity_value_if_missing( - value, formatted_entity, language) + for s in data[SYNONYMS]: + if s and s not in validated_utterances: + validated_utterances[s] = ent_value + + # Add variations if not colliding + all_original_values = _extract_entity_values(entity) + variations = dict() + for data in entity[DATA]: + ent_value = data[VALUE] + values_to_variate = {ent_value} + if use_synonyms: + values_to_variate.update(set(data[SYNONYMS])) + variations[ent_value] = set( + v for value in values_to_variate + for v in get_string_variations(value, language)) + variation_counter = Counter( + [v for vars in itervalues(variations) for v in vars]) + non_colliding_variations = { + value: [ + v for v in variations if + v not in all_original_values and variation_counter[v] == 1 + ] + for value, variations in iteritems(variations) + } + for entry in entity[DATA]: + entry_value = entry[VALUE] + validated_utterances = add_entity_variations( + validated_utterances, non_colliding_variations, entry_value) + + # Merge queries entities + queries_entities_variations = { + ent: get_string_variations(ent, language) for ent in queries_entities + } + for original_ent, variations in iteritems(queries_entities_variations): + if not original_ent or original_ent in validated_utterances: + continue + validated_utterances[original_ent] = original_ent + for variation in variations: + if variation and variation not in validated_utterances: + validated_utterances[variation] = original_ent + formatted_entity[UTTERANCES] = validated_utterances return formatted_entity def validate_and_format_builtin_entity(entity, queries_entities): validate_type(entity, dict) return {UTTERANCES: set(queries_entities)} - - -def add_entity_value_if_missing(value, entity, language): - entity[UTTERANCES] = add_variation_if_needed(entity[UTTERANCES], value, - value, language) - return entity diff --git a/snips_nlu/intent_classifier/log_reg_classifier.py b/snips_nlu/intent_classifier/log_reg_classifier.py index
276429e1f..2ebd19112 100644 --- a/snips_nlu/intent_classifier/log_reg_classifier.py +++ b/snips_nlu/intent_classifier/log_reg_classifier.py @@ -185,7 +185,7 @@ def from_path(cls, path): raise OSError("Missing intent classifier model file: %s" % model_path.name) - with model_path.open() as f: + with model_path.open(encoding="utf8") as f: model_dict = json.load(f) return cls.from_dict(model_dict) diff --git a/snips_nlu/intent_parser/deterministic_intent_parser.py b/snips_nlu/intent_parser/deterministic_intent_parser.py index 9d5767777..e6aeb7bb1 100644 --- a/snips_nlu/intent_parser/deterministic_intent_parser.py +++ b/snips_nlu/intent_parser/deterministic_intent_parser.py @@ -206,7 +206,7 @@ def from_path(cls, path): raise OSError("Missing deterministic intent parser metadata file: " "%s" % metadata_path.name) - with metadata_path.open() as f: + with metadata_path.open(encoding="utf8") as f: metadata = json.load(f) return cls.from_dict(metadata) diff --git a/snips_nlu/intent_parser/probabilistic_intent_parser.py b/snips_nlu/intent_parser/probabilistic_intent_parser.py index d05f719c4..63c4b86d5 100644 --- a/snips_nlu/intent_parser/probabilistic_intent_parser.py +++ b/snips_nlu/intent_parser/probabilistic_intent_parser.py @@ -170,7 +170,7 @@ def from_path(cls, path): raise OSError("Missing probabilistic intent parser model file: " "%s" % model_path.name) - with model_path.open() as f: + with model_path.open(encoding="utf8") as f: model = json.load(f) parser = cls(config=cls.config_type.from_dict(model["config"])) diff --git a/snips_nlu/nlu_engine/nlu_engine.py b/snips_nlu/nlu_engine/nlu_engine.py index 28bab524f..8420592d8 100644 --- a/snips_nlu/nlu_engine/nlu_engine.py +++ b/snips_nlu/nlu_engine/nlu_engine.py @@ -212,7 +212,7 @@ def from_path(cls, path): raise OSError("Missing nlu engine model file: %s" % model_path.name) - with model_path.open() as f: + with model_path.open(encoding="utf8") as f: model = json.load(f) model_version = model.get("model_version") if model_version is None or model_version != __model_version__: diff --git a/snips_nlu/pipeline/processing_unit.py b/snips_nlu/pipeline/processing_unit.py index d003ea73a..d4ee1c90a 100644 --- a/snips_nlu/pipeline/processing_unit.py +++ b/snips_nlu/pipeline/processing_unit.py @@ -140,7 +140,7 @@ def load_processing_unit(unit_path): """Load a :class:`ProcessingUnit` from a persisted processing unit directory""" unit_path = Path(unit_path) - with (unit_path / "metadata.json").open() as f: + with (unit_path / "metadata.json").open(encoding="utf8") as f: metadata = json.load(f) unit = _get_unit_type(metadata["unit_name"]) return unit.from_path(unit_path) diff --git a/snips_nlu/resources.py b/snips_nlu/resources.py index 03659157c..1bd2aa80c 100644 --- a/snips_nlu/resources.py +++ b/snips_nlu/resources.py @@ -50,7 +50,7 @@ def load_resources(name): def load_resources_from_dir(resources_dir): - with (resources_dir / "metadata.json").open() as f: + with (resources_dir / "metadata.json").open(encoding="utf8") as f: metadata = json.load(f) language = metadata["language"] if language in _RESOURCES: @@ -78,7 +78,7 @@ def load_resources_from_dir(resources_dir): def get_resources_sub_directory(resources_dir): resources_dir = Path(resources_dir) - with (resources_dir / "metadata.json").open() as f: + with (resources_dir / "metadata.json").open(encoding="utf8") as f: metadata = json.load(f) resource_name = metadata["name"] version = metadata["version"] diff --git a/snips_nlu/slot_filler/crf_slot_filler.py b/snips_nlu/slot_filler/crf_slot_filler.py 
index e3aeba2d2..5ebd9a763 100644 --- a/snips_nlu/slot_filler/crf_slot_filler.py +++ b/snips_nlu/slot_filler/crf_slot_filler.py @@ -346,7 +346,7 @@ def from_path(cls, path): raise OSError("Missing slot filler model file: %s" % model_path.name) - with model_path.open() as f: + with model_path.open(encoding="utf8") as f: model = json.load(f) slot_filler_config = cls.config_type.from_dict(model["config"]) diff --git a/snips_nlu/tests/test_cli.py b/snips_nlu/tests/test_cli.py index 8d58e99d7..3cba2e5a6 100644 --- a/snips_nlu/tests/test_cli.py +++ b/snips_nlu/tests/test_cli.py @@ -23,6 +23,7 @@ class TestCLI(SnipsTest): # pylint: disable=protected-access def setUp(self): + super(TestCLI, self).setUp() if not self.fixture_dir.exists(): self.fixture_dir.mkdir() @@ -181,6 +182,40 @@ def test_should_generate_entity_from_file(self): } self.assertDictEqual(expected_entity_dict, entity_dict) + def test_should_generate_entity_from_file_with_autoextensible(self): + # Given + examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" + entity_file = examples_path / "entity_location_autoextent_false.txt" + + # When + entity_dataset = CustomEntity.from_file(entity_file) + entity_dict = entity_dataset.json + + # Then + expected_entity_dict = { + "automatically_extensible": False, + "data": [ + { + "synonyms": [ + "big apple" + ], + "value": "new york" + }, + { + "synonyms": [ + "city of lights" + ], + "value": "paris" + }, + { + "synonyms": [], + "value": "london" + } + ], + "use_synonyms": True + } + self.assertDictEqual(expected_entity_dict, entity_dict) + def test_should_generate_dataset_from_files(self): # Given examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" diff --git a/snips_nlu/tests/test_dataset.py b/snips_nlu/tests/test_dataset.py index 320190130..1db610a3a 100644 --- a/snips_nlu/tests/test_dataset.py +++ b/snips_nlu/tests/test_dataset.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from builtins import str - from mock import mock from snips_nlu.constants import ( @@ -37,8 +36,8 @@ def test_missing_intent_key_should_raise_exception(self): # When/Then with self.assertRaises(KeyError) as ctx: validate_and_format_dataset(dataset) - self.assertEqual(str(ctx.exception.args[0]), - "Expected chunk to have key: 'slot_name'") + self.assertEqual("Expected chunk to have key: 'slot_name'", + str(ctx.exception.args[0])) def test_unknown_entity_should_raise_exception(self): # Given @@ -72,8 +71,7 @@ def test_unknown_entity_should_raise_exception(self): # When/Then with self.assertRaises(KeyError) as ctx: validate_and_format_dataset(dataset) - self.assertEqual(str(ctx.exception.args[0]), - "Expected entities to have key: 'unknown_entity'") + self.assertEqual("Expected entities to have key: 'unknown_entity'", str(ctx.exception.args[0])) def test_missing_entity_key_should_raise_exception(self): # Given @@ -92,8 +90,7 @@ def test_missing_entity_key_should_raise_exception(self): # When/Then with self.assertRaises(KeyError) as ctx: validate_and_format_dataset(dataset) - self.assertEqual(str(ctx.exception.args[0]), - "Expected entity to have key: 'use_synonyms'") + self.assertEqual("Expected entity to have key: 'use_synonyms'", str(ctx.exception.args[0])) def test_invalid_language_should_raise_exception(self): # Given @@ -107,7 +104,7 @@ def test_invalid_language_should_raise_exception(self): # When/Then with self.assertRaises(ValueError) as ctx: validate_and_format_dataset(dataset) - self.assertEqual(str(ctx.exception.args[0]), "Unknown language: 'eng'") + self.assertEqual("Unknown 
language: 'eng'", str(ctx.exception.args[0])) @mock.patch("snips_nlu.dataset.get_string_variations") def test_should_format_dataset_by_adding_synonyms( @@ -158,7 +155,7 @@ def mock_get_string_variations(variation, language): dataset = validate_and_format_dataset(dataset) # Then - self.assertDictEqual(dataset, expected_dataset) + self.assertDictEqual(expected_dataset, dataset) @mock.patch("snips_nlu.dataset.get_string_variations") def test_should_format_dataset_by_adding_entity_values( @@ -269,7 +266,7 @@ def mock_get_string_variations(variation, language): dataset = validate_and_format_dataset(dataset) # Then - self.assertEqual(dataset, expected_dataset) + self.assertEqual(expected_dataset, dataset) @mock.patch("snips_nlu.dataset.get_string_variations") def test_should_add_missing_reference_entity_values_when_not_use_synonyms( @@ -377,7 +374,7 @@ def mock_get_string_variations(variation, language): dataset = validate_and_format_dataset(dataset) # Then - self.assertEqual(dataset, expected_dataset) + self.assertEqual(expected_dataset, dataset) def test_should_not_require_data_for_builtin_entities(self): # Given @@ -521,7 +518,7 @@ def mock_get_string_variations(variation, language): dataset = validate_and_format_dataset(dataset) # Then - self.assertEqual(dataset, expected_dataset) + self.assertEqual(expected_dataset, dataset) @mock.patch("snips_nlu.dataset.get_string_variations") def test_should_add_capitalize_field( @@ -691,7 +688,7 @@ def mock_get_string_variations(variation, language): dataset = validate_and_format_dataset(dataset) # Then - self.assertDictEqual(dataset, expected_dataset) + self.assertDictEqual(expected_dataset, dataset) @mock.patch("snips_nlu.dataset.get_string_variations") def test_should_normalize_synonyms( @@ -749,6 +746,7 @@ def mock_get_string_variations(variation, language): "utterances": { "ëntity": "ëNtity", "Ëntity": "ëNtity", + "ëNtity": "ëNtity" }, "automatically_extensible": True, "capitalize": False @@ -763,7 +761,7 @@ def mock_get_string_variations(variation, language): dataset = validate_and_format_dataset(dataset) # Then - self.assertDictEqual(dataset, expected_dataset) + self.assertDictEqual(expected_dataset, dataset) @mock.patch("snips_nlu.dataset.get_string_variations") def test_dataset_should_handle_synonyms( @@ -809,3 +807,65 @@ def mock_get_string_variations(variation, language): # Then self.assertDictEqual(dataset[ENTITIES], expected_entities) + + def test_should_not_avoid_synomyms_variations_collision(self): + # Given + dataset = { + "intents": { + "dummy_but_tricky_intent": { + "utterances": [ + { + "data": [ + { + "text": "dummy_value", + "entity": "dummy_but_tricky_entity", + "slot_name": "dummy_but_tricky_slot" + } + ] + } + ] + } + }, + "entities": { + "dummy_but_tricky_entity": { + "data": [ + { + "value": "a", + "synonyms": [ + "favorïte" + ] + }, + { + "value": "b", + "synonyms": [ + "favorite" + ] + } + ], + "use_synonyms": True, + "automatically_extensible": False + } + }, + "language": "en", + "snips_nlu_version": "0.15.0" + } + + # When + dataset = validate_and_format_dataset(dataset) + + # Then + entity = dataset["entities"]["dummy_but_tricky_entity"] + expected_utterances = { + "A": "a", + "B": "b", + "DummyValue": "dummy_value", + "Dummy_Value": "dummy_value", + "Favorïte": "a", + "a": "a", + "b": "b", + "dummy_value": "dummy_value", + "dummyvalue": "dummy_value", + "favorite": "b", + "favorïte": "a" + } + self.assertDictEqual(expected_utterances, entity["utterances"]) diff --git a/snips_nlu/tests/utils.py b/snips_nlu/tests/utils.py 
index 4ea44722f..3d3db3061 100644 --- a/snips_nlu/tests/utils.py +++ b/snips_nlu/tests/utils.py @@ -23,8 +23,7 @@ class SnipsTest(TestCase): - def __init__(self, methodName='runTest'): - super(SnipsTest, self).__init__(methodName) + def setUp(self): for l in get_all_languages(): load_resources(l) @@ -39,14 +38,14 @@ def fail_if_exception(self, msg): def assertJsonContent(self, json_path, expected_dict): if not json_path.exists(): self.fail("Json file not found: %s" % str(json_path)) - with json_path.open() as f: + with json_path.open(encoding="utf8") as f: data = json.load(f) self.assertDictEqual(expected_dict, data) def assertFileContent(self, path, expected_content): if not path.exists(): self.fail("File not found: %s" % str(path)) - with path.open() as f: + with path.open(encoding="utf8") as f: data = f.read() self.assertEqual(expected_content, data) @@ -68,6 +67,7 @@ class FixtureTest(SnipsTest): # pylint: disable=protected-access def setUp(self): + super(FixtureTest, self).setUp() if not self.fixture_dir.exists(): self.fixture_dir.mkdir() diff --git a/tox.ini b/tox.ini index 442d3e892..37ccd1282 100644 --- a/tox.ini +++ b/tox.ini @@ -31,7 +31,7 @@ setenv= basepython = python3.6 skip_install = true commands = - pip install -e ".[test,integration_test]" + pip install -e ".[test]" snips-nlu download snips_nlu_de-0.2.0 --direct snips-nlu download snips_nlu_en-0.2.0 --direct snips-nlu download snips_nlu_es-0.2.0 --direct
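
Usage sketch for the new entity-file header introduced above (illustrative only: it assumes `CustomEntity` is importable from `snips_nlu.cli.dataset.entities` and uses a hypothetical `location.txt` file; it mirrors the behaviour covered by `test_should_generate_entity_from_file_with_autoextensible`):

# Illustrative sketch: opting out of automatic extension for a custom entity.
# Assumptions: CustomEntity lives in snips_nlu.cli.dataset.entities and
# "location.txt" is a hypothetical entity file created for this example.
import io
from pathlib import Path

from snips_nlu.cli.dataset.entities import CustomEntity  # assumed import path

entity_file = Path("location.txt")
with io.open(str(entity_file), "w", encoding="utf8") as f:
    # The header must be the first line; AUTO_EXT_REGEX expects exactly one
    # space between '#' and 'automatically_extensible'.
    f.write("# automatically_extensible=false\n"
            "new york,big apple\n"
            "paris,city of lights\n"
            "london\n")

entity = CustomEntity.from_file(entity_file)
print(entity.json["automatically_extensible"])  # False
print(entity.json["use_synonyms"])               # True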