diff --git a/Jenkinsfile b/Jenkinsfile index 8144e8f6ed..86ca20f9d7 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -11,7 +11,7 @@ node('gpu') { sh """ virtualenv --python=python3 ".venv-$BUILD_NUMBER" . .venv-$BUILD_NUMBER/bin/activate - sed -ri 's/^ *tensorflow *(=|<|>|\$)/tensorflow-gpu\\1/g' requirements.txt + sed -ri 's/^\\s*tensorflow\\s*(=|<|>|;|\$)/tensorflow-gpu\\1/g' requirements.txt sed -i "s/stream=True/stream=False/g" deeppavlov/core/data/utils.py python setup.py develop pip install http://lnsigo.mipt.ru/export/en_core_web_sm-2.0.0.tar.gz diff --git a/MANIFEST.in b/MANIFEST.in index 1f23f03438..f21bad08ff 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,7 @@ include README.MD include LICENSE include requirements.txt +recursive-include requirements *.txt recursive-include deeppavlov *.json recursive-include deeppavlov *.md recursive-include utils *.json \ No newline at end of file diff --git a/README.md b/README.md index ef2c596460..d1f791e3a9 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,63 @@ [![License Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/deepmipt/DeepPavlov/blob/master/LICENSE) ![Python 3.6](https://img.shields.io/badge/python-3.6-green.svg) -**We are in a really early Alpha release. You should be ready for hard adventures. -In version 0.0.5 we updraded to TensorFlow 1.8, please re-download our pre-trained models.** +_We are still in a really early Alpha release._ +__In version 0.0.6 everything from package `deeppavlov.skills` except `deeppavlov.skills.pattern_matching_skill` was moved to `deeppavlov.models`, so your imports might break__ + DeepPavlov is an open-source conversational AI library built on [TensorFlow](https://www.tensorflow.org/) and [Keras](https://keras.io/). It is designed for * development of production ready chat-bots and complex conversational systems, * NLP and dialog systems research. - -Our goal is to enable AI-application developers and researchers with: - * set of pre-trained NLP models, pre-defined dialog system components (ML/DL/Rule-based) and pipeline templates; - * a framework for implementing and testing their own dialog models; - * tools for application integration with adjacent infrastructure (messengers, helpdesk software etc.); - * benchmarking environment for conversational models and uniform access to relevant datasets. + +# Hello Bot in DeepPavlov + +Import the key components to build HelloBot. +```python +from deeppavlov.core.agent import Agent, HighestConfidenceSelector +from deeppavlov.skills.pattern_matching_skill import PatternMatchingSkill +``` + +Create skills as pre-defined responses for a user's input containing specific keywords. Every skill returns a response and a confidence. +```python +hello = PatternMatchingSkill(responses=['Hello world! :)'], patterns=["hi", "hello", "good day"]) +bye = PatternMatchingSkill(['Goodbye world! :(', 'See you around.'], ["bye", "chao", "see you"]) +fallback = PatternMatchingSkill(["I don't understand, sorry :/", 'I can say "Hello world!" 8)']) +``` + +The agent executes the skills and then takes the response from the skill with the highest confidence. +```python +HelloBot = Agent([hello, bye, fallback], skills_selector=HighestConfidenceSelector()) +``` + +Give the floor to the HelloBot! +```python +print(HelloBot(['Hello!', 'Boo...', 'Bye.'])) +``` + +[Jupyter notebook with HelloBot example.](examples/hello_bot.ipynb) + + +# Installation + +0. Currently we support only the `Linux` platform and `Python 3.6` (**`Python 3.5` is not supported!**) + +1.
Create a virtual environment with `Python 3.6` + ``` + virtualenv env + ``` +2. Activate the environment. + ``` + source ./env/bin/activate + ``` +3. Clone the repo and `cd` to project root + ``` + git clone https://github.com/deepmipt/DeepPavlov.git + cd DeepPavlov + ``` +4. Install basic requirements: + ``` + python setup.py develop + ``` # Demo @@ -20,17 +65,11 @@ Demo of selected features is available at [demo.ipavlov # Conceptual overview - - - +Our goal is to provide AI-application developers and researchers with: + * a set of pre-trained NLP models, pre-defined dialog system components (ML/DL/Rule-based) and pipeline templates; + * a framework for implementing and testing their own dialog models; + * tools for application integration with adjacent infrastructure (messengers, helpdesk software etc.); + * a benchmarking environment for conversational models and uniform access to relevant datasets.
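As a side note on the Jenkinsfile hunk at the top of this diff: the broadened `sed` pattern now tolerates arbitrary whitespace around `tensorflow` and a trailing `;`. Below is a rough Python `re` equivalent, only to illustrate which requirement lines the pattern is meant to rewrite; the sample lines are hypothetical, and the guess that `;` is there for environment markers is an assumption, not something stated in the patch.

```python
import re

# Illustrative Python re equivalent of the Jenkinsfile sed command (not part of the CI script):
# rewrite a bare "tensorflow" requirement, optionally followed by a version specifier
# (=, <, >), an environment-marker ";", or end of line, to "tensorflow-gpu".
PATTERN = re.compile(r'^\s*tensorflow\s*(=|<|>|;|$)')

def to_gpu_requirement(line: str) -> str:
    return PATTERN.sub(r'tensorflow-gpu\1', line)

# Hypothetical requirements.txt lines showing what does and does not get rewritten:
for line in ['tensorflow==1.8.0', 'tensorflow >= 1.4', 'tensorflow',
             'tensorflow ; python_version < "3.7"', 'tensorflow-gpu==1.8.0']:
    print(f'{line!r} -> {to_gpu_requirement(line)!r}')
```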

@@ -56,34 +95,15 @@ DeepPavlov is built on top of machine learning frameworks [TensorFlow](https://w --- -# Installation -0. Currently we support only `Linux` platform and `Python 3.6` (**`Python 3.5` is not supported!**) - -1. Create a virtual environment with `Python 3.6` - ``` - virtualenv env - ``` -2. Activate the environment. - ``` - source ./env/bin/activate - ``` -3. Clone the repo and `cd` to project root - ``` - git clone https://github.com/deepmipt/DeepPavlov.git - cd DeepPavlov - ``` -4. Install the requirements: - ``` - python setup.py develop - ``` -5. Install `spacy` dependencies: - ``` - python -m spacy download en - ``` - # Quick start -To use our pre-trained models, you should first download them: +To use our pre-trained models, you should first install their requirements: +``` +python -m deeppavlov install +``` + + +Then download the models and data for them: ``` python -m deeppavlov download ``` @@ -111,28 +131,6 @@ Every line of input text will be used as a pipeline input parameter, so one exam as many input parameters your pipeline expects. You can also specify batch size with `-b` or `--batch-size` parameter. -Available model configs are: - -- ```deeppavlov/configs/go_bot/*.json``` - -- ```deeppavlov/configs/intents/*.json``` - -- ```deeppavlov/configs/morpho_tagger/*.json``` - -- ```deeppavlov/configs/ner/*.json``` - -- ```deeppavlov/configs/odqa/*.json``` - -- ```deeppavlov/configs/ranking/*.json``` - -- ```deeppavlov/configs/sentiment/*.json``` - -- ```deeppavlov/configs/seq2seq_go_bot/*.json``` - -- ```deeppavlov/configs/spelling_correction/*.json``` - -- ```deeppavlov/configs/squad/*.json``` - # Features | Component | Description | @@ -140,23 +138,22 @@ Available model configs are: | [NER component](deeppavlov/models/ner/README.md) | Based on neural Named Entity Recognition network. The NER component reproduces architecture from the paper [Application of a Hybrid Bi-LSTM-CRF model to the task of Russian Named Entity Recognition](https://arxiv.org/pdf/1709.09686.pdf) which is inspired by Bi-LSTM+CRF architecture from https://arxiv.org/pdf/1603.01360.pdf. | | [Slot filling components](deeppavlov/models/slotfill/README.md) | Based on fuzzy Levenshtein search to extract normalized slot values from text. The components either rely on NER results or perform needle in haystack search.| | [Classification component](deeppavlov/models/classifiers/intents/README.md) | Component for classification tasks (intents, sentiment, etc). Based on shallow-and-wide Convolutional Neural Network architecture from [Kim Y. Convolutional neural networks for sentence classification – 2014](https://arxiv.org/pdf/1408.5882) and others. The model allows multilabel classification of sentences. | +| [Goal-oriented bot](deeppavlov/models/go_bot/README.md) | Based on Hybrid Code Networks (HCNs) architecture from [Jason D. Williams, Kavosh Asadi, Geoffrey Zweig, Hybrid Code Networks: practical and efficient end-to-end dialog control with supervised and reinforcement learning – 2017](https://arxiv.org/abs/1702.03274). It predicts responses in a goal-oriented dialog. The model is customizable: embeddings, slot filler and intent classifier can be switched on and off on demand. | +| [Seq2seq goal-oriented bot](deeppavlov/models/seq2seq_go_bot/README.md) | Dialogue agent predicts responses in a goal-oriented dialog and is able to handle multiple domains (pretrained bot allows calendar scheduling, weather information retrieval, and point-of-interest navigation).
The model is end-to-end differentiable and does not need to explicitly model dialogue state or belief trackers. | | [Automatic spelling correction component](deeppavlov/models/spelling_correction/README.md) | Pipelines that use candidates search in a static dictionary and an ARPA language model to correct spelling errors. | | [Ranking component](deeppavlov/models/ranking/README.md) | Based on [LSTM-based deep learning models for non-factoid answer selection](https://arxiv.org/abs/1511.04108). The model performs ranking of responses or contexts from some database by their relevance for the given context. | | [Question Answering component](deeppavlov/models/squad/README.md) | Based on [R-NET: Machine Reading Comprehension with Self-matching Networks](https://www.microsoft.com/en-us/research/publication/mrc/). The model solves the task of looking for an answer on a question in a given context ([SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) task format). | | [Morphological tagging component](deeppavlov/models/morpho_tagger/README.md) | Based on character-based approach to morphological tagging [Heigold et al., 2017. An extensive empirical evaluation of character-based morphological tagging for 14 languages](http://www.aclweb.org/anthology/E17-1048). A state-of-the-art model for Russian and several other languages. Model assigns morphological tags in UD format to sequences of words.| | **Skills** | | -| [Goal-oriented bot](deeppavlov/skills/go_bot/README.md) | Based on Hybrid Code Networks (HCNs) architecture from [Jason D. Williams, Kavosh Asadi, Geoffrey Zweig, Hybrid Code Networks: practical and efficient end-to-end dialog control with supervised and reinforcement learning – 2017](https://arxiv.org/abs/1702.03274). It allows to predict responses in goal-oriented dialog. The model is customizable: embeddings, slot filler and intent classifier can switched on and off on demand. | -| [Seq2seq goal-oriented bot](deeppavlov/skills/seq2seq_go_bot/README.md) | Dialogue agent predicts responses in a goal-oriented dialog and is able to handle multiple domains (pretrained bot allows calendar scheduling, weather information retrieval, and point-of-interest navigation). The model is end-to-end differentiable and does not need to explicitly model dialogue state or belief trackers. | |[ODQA](deeppavlov/skills/odqa/README.md) | An open domain question answering skill. The skill accepts free-form questions about the world and outputs an answer based on its Wikipedia knowledge.| +| **Parameters Evolution** | | +| [Parameters evolution for models](deeppavlov/models/evolution/README.md) | Implementation of parameters evolution for DeepPavlov models that requires only some small changes in a config file. | | **Embeddings** | | | [Pre-trained embeddings for the Russian language](pretrained-vectors.md) | Word vectors for the Russian language trained on joint [Russian Wikipedia](https://ru.wikipedia.org/wiki/%D0%97%D0%B0%D0%B3%D0%BB%D0%B0%D0%B2%D0%BD%D0%B0%D1%8F_%D1%81%D1%82%D1%80%D0%B0%D0%BD%D0%B8%D1%86%D0%B0) and [Lenta.ru](https://lenta.ru/) corpora. 
| -# Basic examples - -View video demo of deployment of a goal-oriented bot and a slot-filling model with Telegram UI -[![Alt text for your video](https://img.youtube.com/vi/yzoiCa_sMuY/0.jpg)](https://youtu.be/yzoiCa_sMuY) - +# Examples of some components + * Run goal-oriented bot with Telegram interface: ``` python -m deeppavlov interactbot deeppavlov/configs/go_bot/gobot_dstc2.json -d -t @@ -185,6 +182,12 @@ View video demo of deployment of a goal-oriented bot and a slot-filling model wi ``` python -m deeppavlov predict deeppavlov/configs/intents/intents_snips.json -d --batch-size 15 < /data/in.txt > /data/out.txt ``` + + View [video demo](https://youtu.be/yzoiCa_sMuY) of deployment of a goal-oriented bot and a slot-filling model with Telegram UI + +# Tutorials + +Jupyter notebooks and videos explaining how to use DeepPavlov for different tasks can be found in [/examples/tutorials/](examples/tutorials/) --- @@ -239,7 +242,7 @@ -## Config +## Config of component An NLP pipeline config is a JSON file that contains one required element `chainer`: diff --git a/deeppavlov/__init__.py b/deeppavlov/__init__.py index f7e3052462..df3403b16b 100644 --- a/deeppavlov/__init__.py +++ b/deeppavlov/__init__.py @@ -18,92 +18,3 @@ # check version import sys assert sys.hexversion >= 0x3060000, 'Does not work in python3.5 or lower' - -import deeppavlov.core.models.keras_model -import deeppavlov.core.data.vocab -import deeppavlov.core.data.simple_vocab -import deeppavlov.core.data.sqlite_database -import deeppavlov.dataset_readers.babi_reader -import deeppavlov.dataset_readers.dstc2_reader -import deeppavlov.dataset_readers.kvret_reader -import deeppavlov.dataset_readers.conll2003_reader -import deeppavlov.dataset_readers.typos_reader -import deeppavlov.dataset_readers.basic_classification_reader -import deeppavlov.dataset_readers.squad_dataset_reader -import deeppavlov.dataset_readers.morphotagging_dataset_reader - -import deeppavlov.dataset_iterators.dialog_iterator -import deeppavlov.dataset_iterators.kvret_dialog_iterator -import deeppavlov.dataset_iterators.dstc2_ner_iterator -import deeppavlov.dataset_iterators.dstc2_intents_iterator -import deeppavlov.dataset_iterators.typos_iterator -import deeppavlov.dataset_iterators.basic_classification_iterator -import deeppavlov.dataset_iterators.squad_iterator -import deeppavlov.dataset_iterators.sqlite_iterator -import deeppavlov.dataset_iterators.morphotagger_iterator - -import deeppavlov.models.classifiers.intents.intent_model -import deeppavlov.models.commutators.random_commutator -import deeppavlov.models.embedders.fasttext_embedder -import deeppavlov.models.embedders.dict_embedder -import deeppavlov.models.embedders.glove_embedder -import deeppavlov.models.embedders.bow_embedder -import deeppavlov.models.spelling_correction.brillmoore.error_model -import deeppavlov.models.spelling_correction.levenstein.searcher_component -import deeppavlov.models.spelling_correction.electors.kenlm_elector -import deeppavlov.models.spelling_correction.electors.top1_elector -import deeppavlov.models.trackers.hcn_at -import deeppavlov.models.trackers.hcn_et -import deeppavlov.models.preprocessors.str_lower -import deeppavlov.models.preprocessors.squad_preprocessor -import deeppavlov.models.preprocessors.capitalization -import deeppavlov.models.preprocessors.dirty_comments_preprocessor -import deeppavlov.models.tokenizers.nltk_tokenizer -import deeppavlov.models.tokenizers.nltk_moses_tokenizer
-import deeppavlov.models.tokenizers.spacy_tokenizer -import deeppavlov.models.tokenizers.split_tokenizer -import deeppavlov.models.tokenizers.ru_tokenizer -import deeppavlov.models.squad.squad -import deeppavlov.models.morpho_tagger.tagger -import deeppavlov.models.morpho_tagger.common -import deeppavlov.models.api_requester - -import deeppavlov.skills.go_bot.bot -import deeppavlov.skills.go_bot.network -import deeppavlov.skills.go_bot.tracker -import deeppavlov.skills.seq2seq_go_bot.bot -import deeppavlov.skills.seq2seq_go_bot.network -import deeppavlov.skills.seq2seq_go_bot.kb -import deeppavlov.skills.odqa.tfidf_ranker -import deeppavlov.vocabs.typos -import deeppavlov.vocabs.wiki_sqlite -import deeppavlov.dataset_readers.insurance_reader -import deeppavlov.dataset_iterators.ranking_iterator -import deeppavlov.models.ner.network -import deeppavlov.models.ranking.ranking_model -import deeppavlov.models.ranking.metrics -import deeppavlov.models.preprocessors.char_splitter -import deeppavlov.models.preprocessors.mask -import deeppavlov.models.preprocessors.assemble_embeddins_matrix -import deeppavlov.models.preprocessors.capitalization -import deeppavlov.models.preprocessors.field_getter -import deeppavlov.models.preprocessors.sanitizer -import deeppavlov.models.preprocessors.lazy_tokenizer -import deeppavlov.models.slotfill.slotfill_raw -import deeppavlov.models.slotfill.slotfill -import deeppavlov.models.preprocessors.one_hotter -import deeppavlov.dataset_readers.ontonotes_reader - -import deeppavlov.models.classifiers.tokens_matcher.tokens_matcher - - -import deeppavlov.metrics.accuracy -import deeppavlov.metrics.fmeasure -import deeppavlov.metrics.bleu -import deeppavlov.metrics.squad_metrics -import deeppavlov.metrics.roc_auc_score -import deeppavlov.metrics.fmeasure_classification - -import deeppavlov.core.common.log - -import deeppavlov.download diff --git a/deeppavlov/configs/evolution/evolve_intents_snips.json b/deeppavlov/configs/evolution/evolve_intents_snips.json new file mode 100644 index 0000000000..c34a2a6e5a --- /dev/null +++ b/deeppavlov/configs/evolution/evolve_intents_snips.json @@ -0,0 +1,200 @@ +{ + "dataset_reader": { + "name": "basic_classification_reader", + "x": "text", + "y": "intents", + "data_path": "snips" + }, + "dataset_iterator": { + "name": "basic_classification_iterator", + "seed": { + "evolve_range": [ + 50, + 500 + ], + "discrete": true + }, + "field_to_split": "train", + "split_fields": [ + "train", + "valid" + ], + "split_proportions": [ + 0.9, + 0.1 + ] + }, + "chainer": { + "in": [ + "x" + ], + "in_y": [ + "y" + ], + "pipe": [ + { + "id": "classes_vocab", + "name": "default_vocab", + "fit_on": [ + "y" + ], + "level": "token", + "save_path": "vocabs/snips_classes.dict", + "load_path": "vocabs/snips_classes.dict" + }, + { + "in": [ + "x" + ], + "out": [ + "x_lower" + ], + "name": "str_lower" + }, + { + "id": "my_embedder", + "name": "fasttext", + "save_path": "embeddings/dstc2_fastText_model.bin", + "load_path": "embeddings/dstc2_fastText_model.bin", + "dim": 100 + }, + { + "id": "my_tokenizer", + "name": "nltk_tokenizer", + "tokenizer": "wordpunct_tokenize" + }, + { + "in": [ + "x_lower" + ], + "in_y": [ + "y" + ], + "out": [ + "y_labels", + "y_probas_dict" + ], + "main": true, + "name": "intent_model", + "save_path": "evolution/classification/intents_snips", + "load_path": "evolution/classification/intents_snips", + "classes": "#classes_vocab.keys()", + "kernel_sizes_cnn": [ + 1, + 2, + 3 + ], + "filters_cnn": { + "evolve_range": [ + 50, + 100 + ], 
+ "discrete": true + }, + "confident_threshold": { + "evolve_choice": true, + "values": [ + 0.5, + 1 + ] + }, + "optimizer": "Adam", + "lear_rate": { + "evolve_range": [ + 0.0001, + 0.1 + ], + "scale": "log" + }, + "lear_rate_decay": { + "evolve_range": [ + 0.0001, + 0.1 + ], + "scale": "log" + }, + "loss": "binary_crossentropy", + "text_size": 15, + "coef_reg_cnn": { + "evolve_range": [ + 1e-6, + 1e-3 + ] + }, + "coef_reg_den": { + "evolve_range": [ + 1e-6, + 1e-3 + ] + }, + "dropout_rate": { + "evolve_range": [ + 0.1, + 0.9 + ] + }, + "dense_size": { + "evolve_range": [ + 50, + 100 + ], + "discrete": true + }, + "model_name": "cnn_model", + "embedder": "#my_embedder", + "tokenizer": "#my_tokenizer", + "check_bool": { + "evolve_bool": true + } + } + ], + "out": [ + "y_labels", + "y_probas_dict" + ] + }, + "train": { + "epochs": { + "evolve_range": [ + 50, + 500 + ], + "discrete": true + }, + "batch_size": { + "evolve_range": [ + 50, + 500 + ], + "discrete": true + }, + "metrics": [ + "classification_accuracy", + "classification_f1", + "classification_roc_auc" + ], + "validation_patience": 5, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "validate_best": true, + "test_best": false + }, + "metadata": { + "labels": { + "telegram_utils": "IntentModel", + "server_utils": "KerasIntentModel" + }, + "download": [ + "http://lnsigo.mipt.ru/export/deeppavlov_data/intents.tar.gz", + "http://lnsigo.mipt.ru/export/deeppavlov_data/vocabs.tar.gz", + { + "url": "http://lnsigo.mipt.ru/export/datasets/snips_intents/train.csv", + "subdir": "snips" + }, + { + "url": "http://lnsigo.mipt.ru/export/deeppavlov_data/embeddings/dstc2_fastText_model.bin", + "subdir": "embeddings" + } + ] + } +} diff --git a/deeppavlov/configs/go_bot/gobot_dstc2.json b/deeppavlov/configs/go_bot/gobot_dstc2.json index 24b5aa599f..43ebea8710 100644 --- a/deeppavlov/configs/go_bot/gobot_dstc2.json +++ b/deeppavlov/configs/go_bot/gobot_dstc2.json @@ -83,6 +83,11 @@ "show_examples": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt", + "../requirements/spacy.txt" + ], "labels": { "telegram_utils": "GoalOrientedBot", "server_utils": "GoalOrientedBot" diff --git a/deeppavlov/configs/go_bot/gobot_dstc2_all.json b/deeppavlov/configs/go_bot/gobot_dstc2_all.json index ec4b86e59d..98078b1b20 100644 --- a/deeppavlov/configs/go_bot/gobot_dstc2_all.json +++ b/deeppavlov/configs/go_bot/gobot_dstc2_all.json @@ -88,6 +88,11 @@ "show_examples": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt", + "../requirements/spacy.txt" + ], "labels": { "telegram_utils": "GoalOrientedBot", "server_utils": "GoalOrientedBot" diff --git a/deeppavlov/configs/go_bot/gobot_dstc2_best.json b/deeppavlov/configs/go_bot/gobot_dstc2_best.json index 49baffb351..c4f1218208 100644 --- a/deeppavlov/configs/go_bot/gobot_dstc2_best.json +++ b/deeppavlov/configs/go_bot/gobot_dstc2_best.json @@ -95,6 +95,11 @@ "show_examples": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt", + "../requirements/spacy.txt" + ], "labels": { "telegram_utils": "GoalOrientedBot", "server_utils": "GoalOrientedBot" diff --git a/deeppavlov/configs/go_bot/gobot_dstc2_minimal.json b/deeppavlov/configs/go_bot/gobot_dstc2_minimal.json index dae0c2b08d..14452a720c 100644 --- a/deeppavlov/configs/go_bot/gobot_dstc2_minimal.json +++ b/deeppavlov/configs/go_bot/gobot_dstc2_minimal.json @@ -67,6 +67,11 @@ ] }, "train": { + "requirements": [ + 
"../requirements/tf.txt", + "../requirements/fasttext.txt", + "../requirements/spacy.txt" + ], "epochs": 200, "batch_size": 4, diff --git a/deeppavlov/configs/intents/intents_dstc2.json b/deeppavlov/configs/intents/intents_dstc2.json index 519d0bb8b2..ae8946b350 100644 --- a/deeppavlov/configs/intents/intents_dstc2.json +++ b/deeppavlov/configs/intents/intents_dstc2.json @@ -63,9 +63,9 @@ "y_probas_dict" ], "main": true, - "name": "intent_model", - "save_path": "intents/intent_cnn_v4", - "load_path": "intents/intent_cnn_v4", + "name": "keras_classification_model", + "save_path": "intents/intents_dstc2_v4", + "load_path": "intents/intents_dstc2_v4", "classes": "#classes_vocab.keys()", "kernel_sizes_cnn": [ 1, @@ -107,6 +107,10 @@ "show_examples": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt" + ], "labels": { "telegram_utils": "IntentModel", "server_utils": "KerasIntentModel" diff --git a/deeppavlov/configs/intents/intents_dstc2_big.json b/deeppavlov/configs/intents/intents_dstc2_big.json index 3fcc7488eb..bdfe85c4dc 100644 --- a/deeppavlov/configs/intents/intents_dstc2_big.json +++ b/deeppavlov/configs/intents/intents_dstc2_big.json @@ -63,9 +63,9 @@ "y_probas_dict" ], "main": true, - "name": "intent_model", - "save_path": "intents/intent_cnn_v5", - "load_path": "intents/intent_cnn_v5", + "name": "keras_classification_model", + "save_path": "intents/intents_dstc2_v5", + "load_path": "intents/intents_dstc2_v5", "classes": "#classes_vocab.keys()", "kernel_sizes_cnn": [ 1, @@ -107,6 +107,10 @@ "show_examples": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt" + ], "labels": { "telegram_utils": "IntentModel" }, diff --git a/deeppavlov/configs/intents/intents_sample_csv.json b/deeppavlov/configs/intents/intents_sample_csv.json index defdc73d9e..e8042aaed5 100644 --- a/deeppavlov/configs/intents/intents_sample_csv.json +++ b/deeppavlov/configs/intents/intents_sample_csv.json @@ -67,9 +67,9 @@ "y_probas_dict" ], "main": true, - "name": "intent_model", - "save_path": "intents/intent_cnn_snips_v4", - "load_path": "intents/intent_cnn_snips_v4", + "name": "keras_classification_model", + "save_path": "intents/intents_snips_v4", + "load_path": "intents/intents_snips_v4", "classes": "#classes_vocab.keys()", "kernel_sizes_cnn": [ 1, @@ -113,6 +113,10 @@ "test_best": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt" + ], "labels": { "telegram_utils": "IntentModel", "server_utils": "KerasIntentModel" diff --git a/deeppavlov/configs/intents/intents_sample_json.json b/deeppavlov/configs/intents/intents_sample_json.json index 5c3e732a2c..b31a98ce66 100644 --- a/deeppavlov/configs/intents/intents_sample_json.json +++ b/deeppavlov/configs/intents/intents_sample_json.json @@ -62,9 +62,9 @@ "y_probas_dict" ], "main": true, - "name": "intent_model", - "save_path": "intents/intent_cnn_snips_v4", - "load_path": "intents/intent_cnn_snips_v4", + "name": "keras_classification_model", + "save_path": "intents/intents_snips_v4", + "load_path": "intents/intents_snips_v4", "classes": "#classes_vocab.keys()", "kernel_sizes_cnn": [ 1, @@ -108,6 +108,10 @@ "test_best": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt" + ], "labels": { "telegram_utils": "IntentModel", "server_utils": "KerasIntentModel" diff --git a/deeppavlov/configs/intents/intents_snips.json b/deeppavlov/configs/intents/intents_snips.json index 
573b5aca17..9508d2ef52 100644 --- a/deeppavlov/configs/intents/intents_snips.json +++ b/deeppavlov/configs/intents/intents_snips.json @@ -60,9 +60,9 @@ "y_probas_dict" ], "main": true, - "name": "intent_model", - "save_path": "intents/intent_cnn_snips_v4", - "load_path": "intents/intent_cnn_snips_v4", + "name": "keras_classification_model", + "save_path": "intents/intents_snips_v4", + "load_path": "intents/intents_snips_v4", "classes": "#classes_vocab.keys()", "kernel_sizes_cnn": [ 1, @@ -106,6 +106,10 @@ "test_best": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt" + ], "labels": { "telegram_utils": "IntentModel", "server_utils": "KerasIntentModel" diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/hu/morpho_hu_predict.json b/deeppavlov/configs/morpho_tagger/UD2.0/hu/morpho_hu_predict.json index 02b0384f3d..75d6ca1d09 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/hu/morpho_hu_predict.json +++ b/deeppavlov/configs/morpho_tagger/UD2.0/hu/morpho_hu_predict.json @@ -64,6 +64,9 @@ "outfile": "results/ud_hu_test.res" }, "metadata": { + "requirements": [ + "../requirements/tf.txt" + ], "download": [ "http://lnsigo.mipt.ru/export/deeppavlov_data/morpho_tagger.tar.gz", { diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/hu/morpho_hu_train.json b/deeppavlov/configs/morpho_tagger/UD2.0/hu/morpho_hu_train.json index 142c637f34..c9de1b64ca 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/hu/morpho_hu_train.json +++ b/deeppavlov/configs/morpho_tagger/UD2.0/hu/morpho_hu_train.json @@ -66,6 +66,9 @@ "log_every_n_epochs": 1 }, "metadata": { + "requirements": [ + "../requirements/tf.txt" + ], "download": [ "http://lnsigo.mipt.ru/export/deeppavlov_data/morpho_tagger.tar.gz", { diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus_predict.json b/deeppavlov/configs/morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus_predict.json index 92c8ff3187..36b575881e 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus_predict.json +++ b/deeppavlov/configs/morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus_predict.json @@ -65,6 +65,9 @@ "outfile": "results/ud_ru_syntagrus_test.res" }, "metadata": { + "requirements": [ + "../requirements/tf.txt" + ], "download": [ "http://lnsigo.mipt.ru/export/deeppavlov_data/morpho_tagger.tar.gz", { diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus_train.json b/deeppavlov/configs/morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus_train.json index 28bb090215..5ee015a4b7 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus_train.json +++ b/deeppavlov/configs/morpho_tagger/UD2.0/ru_syntagrus/morpho_ru_syntagrus_train.json @@ -66,6 +66,9 @@ "log_every_n_epochs": 1 }, "metadata": { + "requirements": [ + "../requirements/tf.txt" + ], "download": [ "http://lnsigo.mipt.ru/export/deeppavlov_data/morpho_tagger.tar.gz", { diff --git a/deeppavlov/configs/ner/ner_conll2003.json b/deeppavlov/configs/ner/ner_conll2003.json index 6739d49177..93474a85d2 100644 --- a/deeppavlov/configs/ner/ner_conll2003.json +++ b/deeppavlov/configs/ner/ner_conll2003.json @@ -144,6 +144,10 @@ "show_examples": false }, "metadata": { + "requirements": [ + "../requirements/gensim.txt", + "../requirements/tf-gpu.txt" + ], "labels": { "telegram_utils": "NERCoNLL2003Model", "server_utils": "NER" diff --git a/deeppavlov/configs/ner/ner_conll2003_pos.json b/deeppavlov/configs/ner/ner_conll2003_pos.json index 
89e60faf9d..3bbd8f5c05 100644 --- a/deeppavlov/configs/ner/ner_conll2003_pos.json +++ b/deeppavlov/configs/ner/ner_conll2003_pos.json @@ -162,6 +162,10 @@ "show_examples": false }, "metadata": { + "requirements": [ + "../requirements/gensim.txt", + "../requirements/tf-gpu.txt" + ], "labels": { "telegram_utils": "NERCoNLL2003Model", "server_utils": "NER" diff --git a/deeppavlov/configs/ner/ner_dstc2.json b/deeppavlov/configs/ner/ner_dstc2.json index 3c15142bfa..9da047a5dc 100644 --- a/deeppavlov/configs/ner/ner_dstc2.json +++ b/deeppavlov/configs/ner/ner_dstc2.json @@ -91,6 +91,9 @@ "show_examples": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt" + ], "labels": { "telegram_utils": "NERModel", "server_utils": "NER" diff --git a/deeppavlov/configs/ner/ner_ontonotes.json b/deeppavlov/configs/ner/ner_ontonotes.json index e7f102cf19..166b5719ae 100644 --- a/deeppavlov/configs/ner/ner_ontonotes.json +++ b/deeppavlov/configs/ner/ner_ontonotes.json @@ -128,6 +128,10 @@ "show_examples": false }, "metadata": { + "requirements": [ + "../requirements/gensim.txt", + "../requirements/tf-gpu.txt" + ], "labels": { "telegram_utils": "NERCoNLL2003Model", "server_utils": "NER" diff --git a/deeppavlov/configs/ner/ner_rus.json b/deeppavlov/configs/ner/ner_rus.json index 4afc3d7c2f..d1010bf405 100644 --- a/deeppavlov/configs/ner/ner_rus.json +++ b/deeppavlov/configs/ner/ner_rus.json @@ -138,9 +138,14 @@ "val_every_n_epochs": 1, "log_every_n_epochs": 1, - "show_examples": false + "show_examples": false, + "tensorboard_log_dir": "ner_rus/logs" }, "metadata": { + "requirements": [ + "../requirements/fasttext.txt", + "../requirements/tf-gpu.txt" + ], "labels": { "telegram_utils": "NERCoNLL2003Model", "server_utils": "NER" diff --git a/deeppavlov/configs/ner/slotfill_dstc2.json b/deeppavlov/configs/ner/slotfill_dstc2.json index 3bc13288a1..3dc7ee4535 100644 --- a/deeppavlov/configs/ner/slotfill_dstc2.json +++ b/deeppavlov/configs/ner/slotfill_dstc2.json @@ -37,6 +37,9 @@ "metrics": ["slots_accuracy"] }, "metadata": { + "requirements": [ + "../requirements/tf.txt" + ], "labels": { "telegram_utils": "NERModel", "server_utils": "DstcSlotFillingNetwork" diff --git a/deeppavlov/configs/ner/slotfill_dstc2_raw.json b/deeppavlov/configs/ner/slotfill_dstc2_raw.json index f0197f1679..80089b9a02 100644 --- a/deeppavlov/configs/ner/slotfill_dstc2_raw.json +++ b/deeppavlov/configs/ner/slotfill_dstc2_raw.json @@ -23,6 +23,9 @@ "out": ["slots"] }, "metadata": { + "requirements": [ + "../requirements/tf.txt" + ], "labels": { "telegram_utils": "NERModel" }, diff --git a/deeppavlov/configs/odqa/en_odqa_infer_prod.json b/deeppavlov/configs/odqa/en_odqa_infer_wiki.json similarity index 80% rename from deeppavlov/configs/odqa/en_odqa_infer_prod.json rename to deeppavlov/configs/odqa/en_odqa_infer_wiki.json index 719c2fe7f2..4f65aaecbf 100644 --- a/deeppavlov/configs/odqa/en_odqa_infer_prod.json +++ b/deeppavlov/configs/odqa/en_odqa_infer_wiki.json @@ -12,7 +12,7 @@ ], "pipe": [ { - "config_path": "../deeppavlov/configs/odqa/en_ranker_prod.json", + "config_path": "../deeppavlov/configs/ranking/en_ranker_tfidf_wiki.json", "in": [ "question_raw" ], @@ -46,10 +46,15 @@ ] }, "metadata": { + "requirements": [ + "../requirements/tf-gpu.txt", + "../requirements/spacy.txt" + ], "labels": { "server_utils": "ODQA" }, "download": [ + "http://lnsigo.mipt.ru/export/datasets/wikipedia/enwiki.tar.gz", "http://lnsigo.mipt.ru/export/deeppavlov_data/en_odqa.tar.gz", "http://lnsigo.mipt.ru/export/deeppavlov_data/squad_model_1.1.tar.gz" 
] diff --git a/deeppavlov/configs/odqa/ru_odqa_infer_prod.json b/deeppavlov/configs/odqa/ru_odqa_infer_wiki.json similarity index 83% rename from deeppavlov/configs/odqa/ru_odqa_infer_prod.json rename to deeppavlov/configs/odqa/ru_odqa_infer_wiki.json index 0dd568c206..079ea2e19d 100644 --- a/deeppavlov/configs/odqa/ru_odqa_infer_prod.json +++ b/deeppavlov/configs/odqa/ru_odqa_infer_wiki.json @@ -12,7 +12,7 @@ ], "pipe": [ { - "config_path": "../deeppavlov/configs/odqa/ru_ranker_prod.json", + "config_path": "../deeppavlov/configs/ranking/ru_ranker_tfidf_wiki.json", "in": [ "question_raw" ], @@ -46,10 +46,14 @@ ] }, "metadata": { + "requirements": [ + "../requirements/tf-gpu.txt" + ], "labels": { "server_utils": "ODQA" }, "download": [ + "http://lnsigo.mipt.ru/export/datasets/wikipedia/ruwiki.tar.gz", "http://lnsigo.mipt.ru/export/deeppavlov_data/ru_odqa.tar.gz", "http://lnsigo.mipt.ru/export/deeppavlov_data/squad_model_ru.tar.gz" ] diff --git a/deeppavlov/configs/odqa/en_ranker_prod.json b/deeppavlov/configs/ranking/en_ranker_tfidf_wiki.json similarity index 89% rename from deeppavlov/configs/odqa/en_ranker_prod.json rename to deeppavlov/configs/ranking/en_ranker_tfidf_wiki.json index b12fd56eee..cfc67b9ff9 100644 --- a/deeppavlov/configs/odqa/en_ranker_prod.json +++ b/deeppavlov/configs/ranking/en_ranker_tfidf_wiki.json @@ -51,10 +51,14 @@ "batch_size": 10000 }, "metadata": { + "requirements": [ + "../requirements/spacy.txt" + ], "labels": { "server_utils": "Ranker" }, "download": [ + "http://lnsigo.mipt.ru/export/datasets/wikipedia/enwiki.tar.gz", "http://lnsigo.mipt.ru/export/deeppavlov_data/en_odqa.tar.gz" ] } diff --git a/deeppavlov/configs/ranking/ranking_insurance.json b/deeppavlov/configs/ranking/ranking_insurance.json index d1de282c03..2b4c6e40b7 100644 --- a/deeppavlov/configs/ranking/ranking_insurance.json +++ b/deeppavlov/configs/ranking/ranking_insurance.json @@ -56,6 +56,10 @@ "log_every_n_batches": 10 }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/gensim.txt" + ], "labels": { "telegram_utils": "RankingModel", "server_utils": "Ranker" diff --git a/deeppavlov/configs/odqa/ru_ranker_prod.json b/deeppavlov/configs/ranking/ru_ranker_tfidf_wiki.json similarity index 92% rename from deeppavlov/configs/odqa/ru_ranker_prod.json rename to deeppavlov/configs/ranking/ru_ranker_tfidf_wiki.json index 3b380bc5eb..b9e7df93c6 100644 --- a/deeppavlov/configs/odqa/ru_ranker_prod.json +++ b/deeppavlov/configs/ranking/ru_ranker_tfidf_wiki.json @@ -51,10 +51,12 @@ "batch_size": 10000 }, "metadata": { + "requirements": [], "labels": { "server_utils": "Ranker" }, "download": [ + "http://lnsigo.mipt.ru/export/datasets/wikipedia/ruwiki.tar.gz", "http://lnsigo.mipt.ru/export/deeppavlov_data/ru_odqa.tar.gz" ] } diff --git a/deeppavlov/configs/sentiment/insults_kaggle.json b/deeppavlov/configs/sentiment/insults_kaggle.json index 82eaf6bc36..5d8a746bfb 100644 --- a/deeppavlov/configs/sentiment/insults_kaggle.json +++ b/deeppavlov/configs/sentiment/insults_kaggle.json @@ -60,7 +60,7 @@ "y_probas_dict" ], "main": true, - "name": "intent_model", + "name": "keras_classification_model", "save_path": "sentiment/insults_kaggle_v0", "load_path": "sentiment/insults_kaggle_v0", "classes": "#classes_vocab.keys()", @@ -107,6 +107,10 @@ "test_best": true }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt" + ], "labels": { "telegram_utils": "IntentModel", "server_utils": "KerasIntentModel" diff --git 
a/deeppavlov/configs/sentiment/sentiment_ag_news.json b/deeppavlov/configs/sentiment/sentiment_ag_news.json index 897111dba7..5c18570463 100644 --- a/deeppavlov/configs/sentiment/sentiment_ag_news.json +++ b/deeppavlov/configs/sentiment/sentiment_ag_news.json @@ -59,7 +59,7 @@ "y_probas_dict" ], "main": true, - "name": "intent_model", + "name": "keras_classification_model", "save_path": "sentiment/sentiment_ag_news_v0", "load_path": "sentiment/sentiment_ag_news_v0", "classes": "#classes_vocab.keys()", @@ -106,6 +106,10 @@ "test_best": true }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt" + ], "labels": { "telegram_utils": "IntentModel", "server_utils": "KerasIntentModel" diff --git a/deeppavlov/configs/sentiment/sentiment_twitter.json b/deeppavlov/configs/sentiment/sentiment_twitter.json index df36bf3b38..995ed8e5b4 100644 --- a/deeppavlov/configs/sentiment/sentiment_twitter.json +++ b/deeppavlov/configs/sentiment/sentiment_twitter.json @@ -60,7 +60,7 @@ "y_probas_dict" ], "main": true, - "name": "intent_model", + "name": "keras_classification_model", "save_path": "sentiment/sentiment_twitter_v1", "load_path": "sentiment/sentiment_twitter_v1", "classes": "#classes_vocab.keys()", @@ -107,6 +107,10 @@ "test_best": true }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt" + ], "labels": { "telegram_utils": "IntentModel", "server_utils": "KerasIntentModel" diff --git a/deeppavlov/configs/seq2seq_go_bot/bot_kvret.json b/deeppavlov/configs/seq2seq_go_bot/bot_kvret.json index c61470632b..69e9d27e03 100644 --- a/deeppavlov/configs/seq2seq_go_bot/bot_kvret.json +++ b/deeppavlov/configs/seq2seq_go_bot/bot_kvret.json @@ -107,6 +107,10 @@ "show_examples": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/spacy.txt" + ], "labels": { "telegram_utils": "Seq2SeqGoalOrientedBot", "server_utils": "GoalOrientedBot" diff --git a/deeppavlov/configs/seq2seq_go_bot/bot_kvret_infer.json b/deeppavlov/configs/seq2seq_go_bot/bot_kvret_infer.json index 269a3fefeb..a03f33644a 100644 --- a/deeppavlov/configs/seq2seq_go_bot/bot_kvret_infer.json +++ b/deeppavlov/configs/seq2seq_go_bot/bot_kvret_infer.json @@ -82,6 +82,10 @@ "show_examples": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/spacy.txt" + ], "labels": { "telegram_utils": "Seq2SeqGoalOrientedBot", "server_utils": "GoalOrientedBot" diff --git a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru.json b/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru.json index da62bff8a1..f92ba7d22a 100644 --- a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru.json +++ b/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru.json @@ -59,6 +59,9 @@ "test_best": true }, "metadata": { + "requirements": [ + "../requirements/spelling.txt" + ], "labels": { "telegram_utils": "ErrorModel", "server_utils": "ErrorModel" diff --git a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_custom_vocab.json b/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_custom_vocab.json index cc69ec0aa9..884800987e 100644 --- a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_custom_vocab.json +++ b/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_custom_vocab.json @@ -61,6 +61,9 @@ "test_best": true }, "metadata": { + "requirements": [ + "../requirements/spelling.txt" + ], "labels": { "telegram_utils": "ErrorModel", "server_utils": 
"ErrorModel" diff --git a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_nolm.json b/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_nolm.json index b722be5021..22b2c7b1c8 100644 --- a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_nolm.json +++ b/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_nolm.json @@ -58,6 +58,9 @@ "test_best": true }, "metadata": { + "requirements": [ + "../requirements/spelling.txt" + ], "labels": { "telegram_utils": "ErrorModel", "server_utils": "ErrorModel" diff --git a/deeppavlov/configs/spelling_correction/brillmoore_wikitypos_en.json b/deeppavlov/configs/spelling_correction/brillmoore_wikitypos_en.json index 16ebbad906..5b801ea844 100644 --- a/deeppavlov/configs/spelling_correction/brillmoore_wikitypos_en.json +++ b/deeppavlov/configs/spelling_correction/brillmoore_wikitypos_en.json @@ -58,6 +58,9 @@ "test_best": true }, "metadata": { + "requirements": [ + "../requirements/spelling.txt" + ], "labels": { "telegram_utils": "ErrorModel", "server_utils": "ErrorModel" diff --git a/deeppavlov/configs/spelling_correction/levenstein_corrector_ru.json b/deeppavlov/configs/spelling_correction/levenstein_corrector_ru.json index fe03fd1584..73b562cdcb 100644 --- a/deeppavlov/configs/spelling_correction/levenstein_corrector_ru.json +++ b/deeppavlov/configs/spelling_correction/levenstein_corrector_ru.json @@ -41,6 +41,9 @@ "out": ["y_predicted"] }, "metadata": { + "requirements": [ + "../requirements/spelling.txt" + ], "labels": { "telegram_utils": "ErrorModel", "server_utils": "ErrorModel" diff --git a/deeppavlov/configs/squad/squad.json b/deeppavlov/configs/squad/squad.json index 87b3d5f4e3..96c2f78a44 100644 --- a/deeppavlov/configs/squad/squad.json +++ b/deeppavlov/configs/squad/squad.json @@ -104,6 +104,9 @@ "metrics": ["squad_f1", "exact_match"] }, "metadata": { + "requirements": [ + "../requirements/tf-gpu.txt" + ], "labels": { "telegram_utils": "SquadModel", "server_utils": "SquadModel" diff --git a/deeppavlov/configs/squad/squad_ru.json b/deeppavlov/configs/squad/squad_ru.json index d97230e8a9..4501ab9846 100644 --- a/deeppavlov/configs/squad/squad_ru.json +++ b/deeppavlov/configs/squad/squad_ru.json @@ -105,6 +105,9 @@ "metrics": ["squad_f1", "exact_match"] }, "metadata": { + "requirements": [ + "../requirements/tf-gpu.txt" + ], "labels": { "telegram_utils": "SquadModel", "server_utils": "SquadModel" diff --git a/deeppavlov/core/commands/infer.py b/deeppavlov/core/commands/infer.py index bc035b4296..53b5b956d2 100644 --- a/deeppavlov/core/commands/infer.py +++ b/deeppavlov/core/commands/infer.py @@ -13,10 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -from deeppavlov.core.commands.utils import set_deeppavlov_root +from deeppavlov.core.commands.utils import set_deeppavlov_root, import_packages from deeppavlov.core.common.chainer import Chainer from deeppavlov.core.common.file import read_json -from deeppavlov.core.common.registry import REGISTRY from deeppavlov.core.agent.agent import Agent from deeppavlov.core.common.params import from_params @@ -28,6 +27,9 @@ def build_model_from_config(config, mode='infer', load_trained=False, as_component=False): set_deeppavlov_root(config) + + import_packages(config.get('metadata', {}).get('imports', [])) + model_config = config['chainer'] model = Chainer(model_config['in'], model_config['out'], model_config.get('in_y'), as_component=as_component) diff --git a/deeppavlov/core/commands/train.py b/deeppavlov/core/commands/train.py index 6e3c0a0561..efc3e5c4f2 100644 --- a/deeppavlov/core/commands/train.py +++ b/deeppavlov/core/commands/train.py @@ -22,12 +22,12 @@ from pathlib import Path from typing import List, Callable, Tuple, Dict, Union -from deeppavlov.core.commands.utils import expand_path, set_deeppavlov_root +from deeppavlov.core.commands.utils import expand_path, set_deeppavlov_root, import_packages from deeppavlov.core.commands.infer import build_model_from_config from deeppavlov.core.common.chainer import Chainer from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.file import read_json -from deeppavlov.core.common.registry import model as get_model +from deeppavlov.core.common.registry import get_model from deeppavlov.core.common.metrics_registry import get_metrics_by_names from deeppavlov.core.common.params import from_params from deeppavlov.core.data.data_learning_iterator import DataLearningIterator @@ -101,6 +101,8 @@ def train_evaluate_model_from_config(config: [str, Path, dict], to_train=True, t config = read_json(config) set_deeppavlov_root(config) + import_packages(config.get('metadata', {}).get('imports', [])) + dataset_config = config.get('dataset', None) if dataset_config: @@ -231,7 +233,8 @@ def _train_batches(model: NNModel, iterator: DataLearningIterator, train_config: # 'show_examples': False, 'validate_best': True, - 'test_best': True + 'test_best': True, + 'tensorboard_log_dir': None, } train_config = dict(default_train_config, **train_config) @@ -258,6 +261,14 @@ def improved(score, best): losses = [] start_time = time.time() break_flag = False + + if train_config['tensorboard_log_dir'] is not None: + import tensorflow as tf + tb_log_dir = expand_path(train_config['tensorboard_log_dir']) + + tb_train_writer = tf.summary.FileWriter(str(tb_log_dir / 'train_log')) + tb_valid_writer = tf.summary.FileWriter(str(tb_log_dir / 'valid_log')) + try: while True: for x, y_true in iterator.gen_batches(train_config['batch_size']): @@ -280,10 +291,23 @@ def improved(score, best): 'metrics': prettify_metrics(metrics), 'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5))) } + if losses: report['loss'] = sum(losses)/len(losses) losses = [] report = {'train': report} + + if train_config['tensorboard_log_dir'] is not None: + for name, score in metrics: + metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_batches/' + name, + simple_value=score), ]) + tb_train_writer.add_summary(metric_sum, i) + + if losses: + loss_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_batches/' + 'loss', + simple_value=report['loss']), ]) + tb_train_writer.add_summary(loss_sum, i) + print(json.dumps(report, ensure_ascii=False)) 
train_y_true.clear() train_y_predicted.clear() @@ -322,6 +346,21 @@ def improved(score, best): 'metrics': prettify_metrics(metrics), 'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5))) } + if losses: + report['loss'] = sum(losses)/len(losses) + losses = [] + + if train_config['tensorboard_log_dir'] is not None: + for name, score in metrics: + metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + name, + simple_value=score), ]) + tb_train_writer.add_summary(metric_sum, epochs) + + if losses: + loss_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + 'loss', + simple_value=report['loss']), ]) + tb_train_writer.add_summary(loss_sum, epochs) + model.process_event(event_name='after_train_log', data=report) report = {'train': report} print(json.dumps(report, ensure_ascii=False)) @@ -337,6 +376,12 @@ def improved(score, best): metrics = list(report['metrics'].items()) + if train_config['tensorboard_log_dir'] is not None: + for name, score in metrics: + metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + name, + simple_value=score), ]) + tb_valid_writer.add_summary(metric_sum, epochs) + m_name, score = metrics[0] if improved(score, best): patience = 0 diff --git a/deeppavlov/core/commands/utils.py b/deeppavlov/core/commands/utils.py index e0c40bf18d..d7550f6fae 100644 --- a/deeppavlov/core/commands/utils.py +++ b/deeppavlov/core/commands/utils.py @@ -50,3 +50,8 @@ def is_empty(d: Path) -> bool: Check if directory is empty. """ return not bool(list(d.iterdir())) + + +def import_packages(packages: list): + for package in packages: + __import__(package) diff --git a/deeppavlov/core/common/metrics_registry.json b/deeppavlov/core/common/metrics_registry.json new file mode 100644 index 0000000000..7936c400e9 --- /dev/null +++ b/deeppavlov/core/common/metrics_registry.json @@ -0,0 +1,25 @@ +{ + "accuracy": "deeppavlov.metrics.accuracy:accuracy", + "bleu": "deeppavlov.metrics.bleu:bleu", + "classification_accuracy": "deeppavlov.metrics.accuracy:classification_accuracy", + "classification_f1": "deeppavlov.metrics.fmeasure_classification:fmeasure", + "classification_f1_weighted": "deeppavlov.metrics.fmeasure_classification:fmeasure", + "classification_log_loss": "deeppavlov.metrics.log_loss:classification_log_loss", + "classification_mrr": "deeppavlov.metrics.mrr_classification:mrr_score", + "classification_roc_auc": "deeppavlov.metrics.roc_auc_score:roc_auc_score", + "exact_match": "deeppavlov.metrics.squad_metrics:exact_match", + "loss": "deeppavlov.models.ranking.metrics:triplet_loss", + "ner_f1": "deeppavlov.metrics.fmeasure:ner_f1", + "per_item_accuracy": "deeppavlov.metrics.accuracy:per_item_accuracy", + "per_item_bleu": "deeppavlov.metrics.bleu:per_item_bleu", + "per_item_dialog_accuracy": "deeppavlov.metrics.accuracy:per_item_dialog_accuracy", + "per_item_dialog_bleu": "deeppavlov.metrics.bleu:per_item_dialog_bleu", + "per_token_accuracy": "deeppavlov.metrics.accuracy:per_token_accuracy", + "r@1": "deeppavlov.models.ranking.metrics:r_at_1", + "r@2": "deeppavlov.models.ranking.metrics:r_at_2", + "r@5": "deeppavlov.models.ranking.metrics:r_at_5", + "rank_response": "deeppavlov.models.ranking.metrics:rank_response", + "sets_accuracy": "deeppavlov.metrics.accuracy:sets_accuracy", + "slots_accuracy": "deeppavlov.metrics.accuracy:slots_accuracy", + "squad_f1": "deeppavlov.metrics.squad_metrics:squad_f1" +} \ No newline at end of file diff --git a/deeppavlov/core/common/metrics_registry.py 
b/deeppavlov/core/common/metrics_registry.py index c9b8253d99..add6be0c42 100644 --- a/deeppavlov/core/common/metrics_registry.py +++ b/deeppavlov/core/common/metrics_registry.py @@ -1,19 +1,39 @@ +import importlib +from pathlib import Path +import json + from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.log import get_logger log = get_logger(__name__) -_REGISTRY = {} +_registry_path = Path(__file__).parent / 'metrics_registry.json' +if _registry_path.exists(): + with _registry_path.open(encoding='utf-8') as f: + _REGISTRY = json.load(f) +else: + _REGISTRY = {} + + +def fn_from_str(name: str) -> type: + try: + module_name, fn_name = name.split(':') + except ValueError: + raise ConfigError('Expected function description in a `module.submodules:function_name` form, but got `{}`' + .format(name)) + + return getattr(importlib.import_module(module_name), fn_name) def register_metric(metric_name): - def decorate(f): - if metric_name in _REGISTRY: + def decorate(fn): + fn_name = fn.__module__ + ':' + fn.__name__ + if metric_name in _REGISTRY and _REGISTRY[metric_name] != fn_name: log.warning('"{}" is already registered as a metric name, the old function will be ignored' .format(metric_name)) - _REGISTRY[metric_name] = f - return f + _REGISTRY[metric_name] = fn_name + return fn return decorate @@ -21,4 +41,4 @@ def get_metrics_by_names(names: list): not_found = [name for name in names if name not in _REGISTRY] if not_found: raise ConfigError('Names {} are not registered as metrics'.format(not_found)) - return [_REGISTRY[name] for name in names] + return [fn_from_str(_REGISTRY[name]) for name in names] diff --git a/deeppavlov/core/common/params.py b/deeppavlov/core/common/params.py index faac8a9c02..6b3e2e3ce1 100644 --- a/deeppavlov/core/common/params.py +++ b/deeppavlov/core/common/params.py @@ -19,7 +19,7 @@ from deeppavlov.core.commands.utils import expand_path, get_deeppavlov_root, set_deeppavlov_root from deeppavlov.core.common.file import read_json -from deeppavlov.core.common.registry import REGISTRY +from deeppavlov.core.common.registry import get_model, cls_from_str from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.log import get_logger from deeppavlov.core.models.component import Component @@ -84,27 +84,14 @@ def from_params(params: Dict, mode='infer', **kwargs) -> Component: return model elif 'class' in config_params: - c = config_params.pop('class') - try: - module_name, cls_name = c.split(':') - cls = getattr(importlib.import_module(module_name), cls_name) - except ValueError: - e = ConfigError('Expected class description in a `module.submodules:ClassName` form, but got `{}`' - .format(c)) - log.exception(e) - raise e + cls = cls_from_str(config_params.pop('class')) else: cls_name = config_params.pop('name', None) if not cls_name: e = ConfigError('Component config has no `name` nor `ref` or `class` fields') log.exception(e) raise e - try: - cls = REGISTRY[cls_name] - except KeyError: - e = ConfigError('Class {} is not registered.'.format(cls_name)) - log.exception(e) - raise e + cls = get_model(cls_name) # find the submodels params recursively config_params = {k: _init_param(v, mode) for k, v in config_params.items()} diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json new file mode 100644 index 0000000000..0c8673bb10 --- /dev/null +++ b/deeppavlov/core/common/registry.json @@ -0,0 +1,93 @@ +{ + "api_requester": "deeppavlov.models.api_requester.api_requester:ApiRequester", + 
"api_router": "deeppavlov.models.api_requester.api_router:ApiRouter", + "babi_reader": "deeppavlov.dataset_readers.babi_reader:BabiDatasetReader", + "basic_classification_iterator": "deeppavlov.dataset_iterators.basic_classification_iterator:BasicClassificationDatasetIterator", + "basic_classification_reader": "deeppavlov.dataset_readers.basic_classification_reader:BasicClassificationDatasetReader", + "bow": "deeppavlov.models.embedders.bow_embedder:BoWEmbedder", + "capitalization_featurizer": "deeppavlov.models.preprocessors.capitalization:CapitalizationPreprocessor", + "char_splitter": "deeppavlov.models.preprocessors.char_splitter:CharSplitter", + "char_vocab": "deeppavlov.core.data.simple_vocab:CharacterVocab", + "conll2003_reader": "deeppavlov.dataset_readers.conll2003_reader:Conll2003DatasetReader", + "data_fitting_iterator": "deeppavlov.core.data.data_fitting_iterator:DataFittingIterator", + "data_learning_iterator": "deeppavlov.core.data.data_learning_iterator:DataLearningIterator", + "default_tracker": "deeppavlov.models.trackers.default_tracker:DefaultTracker", + "default_vocab": "deeppavlov.core.data.vocab:DefaultVocabulary", + "dialog_db_result_iterator": "deeppavlov.dataset_iterators.dialog_iterator:DialogDBResultDatasetIterator", + "dialog_iterator": "deeppavlov.dataset_iterators.dialog_iterator:DialogDatasetIterator", + "dialog_vocab": "deeppavlov.core.data.simple_vocab:DialogVocab", + "dict_emb": "deeppavlov.models.embedders.dict_embedder:DictEmbedder", + "dirty_comments_preprocessor": "deeppavlov.models.preprocessors.dirty_comments_preprocessor:DirtyCommentsPreprocessor", + "dstc2_intents_iterator": "deeppavlov.dataset_iterators.dstc2_intents_iterator:Dstc2IntentsDatasetIterator", + "dstc2_ner_iterator": "deeppavlov.dataset_iterators.dstc2_ner_iterator:Dstc2NerDatasetIterator", + "dstc2_reader": "deeppavlov.dataset_readers.dstc2_reader:DSTC2DatasetReader", + "dstc2_v2_reader": "deeppavlov.dataset_readers.dstc2_reader:DSTC2Version2DatasetReader", + "dstc_slotfilling": "deeppavlov.models.slotfill.slotfill:DstcSlotFillingNetwork", + "emb_mat_assembler": "deeppavlov.models.preprocessors.assemble_embeddins_matrix:EmbeddingsMatrixAssembler", + "fasttext": "deeppavlov.models.embedders.fasttext_embedder:FasttextEmbedder", + "featurized_tracker": "deeppavlov.models.go_bot.tracker:FeaturizedTracker", + "field_getter": "deeppavlov.models.preprocessors.field_getter:FieldGetter", + "glove": "deeppavlov.models.embedders.glove_embedder:GloVeEmbedder", + "go_bot": "deeppavlov.models.go_bot.bot:GoalOrientedBot", + "go_bot_rnn": "deeppavlov.models.go_bot.network:GoalOrientedBotNetwork", + "hashing_tfidf_vectorizer": "deeppavlov.models.vectorizers.hashing_tfidf_vectorizer:HashingTfIdfVectorizer", + "hcn_at": "deeppavlov.models.trackers.hcn_at:ActionTracker", + "hcn_et": "deeppavlov.models.trackers.hcn_et:EntityTracker", + "insurance_reader": "deeppavlov.dataset_readers.insurance_reader:InsuranceReader", + "kenlm_elector": "deeppavlov.models.spelling_correction.electors.kenlm_elector:KenlmElector", + "keras_classification_model": "deeppavlov.models.classifiers.keras_classification_model:KerasClassificationModel", + "knowledge_base": "deeppavlov.models.seq2seq_go_bot.kb:KnowledgeBase", + "knowledge_base_entity_normalizer": "deeppavlov.models.seq2seq_go_bot.kb:KnowledgeBaseEntityNormalizer", + "kvret_dialog_iterator": "deeppavlov.dataset_iterators.kvret_dialog_iterator:KvretDialogDatasetIterator", + "kvret_reader": "deeppavlov.dataset_readers.kvret_reader:KvretDatasetReader", + 
"lazy_tokenizer": "deeppavlov.models.preprocessors.lazy_tokenizer:LazyTokenizer", + "lowercase_preprocessor": "deeppavlov.models.preprocessors.capitalization:LowercasePreprocessor", + "mask": "deeppavlov.models.preprocessors.mask:Mask", + "morpho_tagger": "deeppavlov.models.morpho_tagger.tagger:MorphoTaggerWrapper", + "morphotagger_dataset": "deeppavlov.dataset_iterators.morphotagger_iterator:MorphoTaggerDatasetIterator", + "morphotagger_dataset_reader": "deeppavlov.dataset_readers.morphotagging_dataset_reader:MorphotaggerDatasetReader", + "ner": "deeppavlov.models.ner.network:NerNetwork", + "nltk_moses_tokenizer": "deeppavlov.models.tokenizers.nltk_moses_tokenizer:NLTKTokenizer", + "nltk_tokenizer": "deeppavlov.models.tokenizers.nltk_tokenizer:NLTKTokenizer", + "one_hotter": "deeppavlov.models.preprocessors.one_hotter:OneHotter", + "ontonotes_reader": "deeppavlov.dataset_readers.ontonotes_reader:OntonotesReader", + "params_evolution": "deeppavlov.models.evolution.evolution_param_generator:ParamsEvolution", + "pymorphy_russian_lemmatizer": "deeppavlov.models.preprocessors.russian_lemmatizer:PymorphyRussianLemmatizer", + "random": "deeppavlov.models.commutators.random_commutator:RandomCommutator", + "random_emb_mat": "deeppavlov.models.preprocessors.assemble_embeddins_matrix:RandomEmbeddingsMatrix", + "ranking_iterator": "deeppavlov.dataset_iterators.ranking_iterator:RankingIterator", + "ranking_model": "deeppavlov.models.ranking.ranking_model:RankingModel", + "ru_sent_tokenizer": "deeppavlov.models.tokenizers.ru_sent_tokenizer:RuSentTokenizer", + "ru_tokenizer": "deeppavlov.models.tokenizers.ru_tokenizer:RussianTokenizer", + "russian_words_vocab": "deeppavlov.vocabs.typos:RussianWordsVocab", + "sanitizer": "deeppavlov.models.preprocessors.sanitizer:Sanitizer", + "seq2seq_go_bot": "deeppavlov.models.seq2seq_go_bot.bot:Seq2SeqGoalOrientedBot", + "seq2seq_go_bot_nn": "deeppavlov.models.seq2seq_go_bot.network:Seq2SeqGoalOrientedBotNetwork", + "simple_vocab": "deeppavlov.core.data.simple_vocab:SimpleVocabulary", + "slotfill_raw": "deeppavlov.models.slotfill.slotfill_raw:SlotFillingComponent", + "spelling_error_model": "deeppavlov.models.spelling_correction.brillmoore.error_model:ErrorModel", + "spelling_levenstein": "deeppavlov.models.spelling_correction.levenstein.searcher_component:LevensteinSearcherComponent", + "split_tokenizer": "deeppavlov.models.tokenizers.split_tokenizer:SplitTokenizer", + "sqlite_database": "deeppavlov.core.data.sqlite_database:Sqlite3Database", + "sqlite_iterator": "deeppavlov.dataset_iterators.sqlite_iterator:SQLiteDataIterator", + "squad_ans_postprocessor": "deeppavlov.models.preprocessors.squad_preprocessor:SquadAnsPostprocessor", + "squad_ans_preprocessor": "deeppavlov.models.preprocessors.squad_preprocessor:SquadAnsPreprocessor", + "squad_dataset_reader": "deeppavlov.dataset_readers.squad_dataset_reader:SquadDatasetReader", + "squad_iterator": "deeppavlov.dataset_iterators.squad_iterator:SquadIterator", + "squad_model": "deeppavlov.models.squad.squad:SquadModel", + "squad_preprocessor": "deeppavlov.models.preprocessors.squad_preprocessor:SquadPreprocessor", + "squad_vocab_embedder": "deeppavlov.models.preprocessors.squad_preprocessor:SquadVocabEmbedder", + "static_dictionary": "deeppavlov.vocabs.typos:StaticDictionary", + "str_lower": "deeppavlov.models.preprocessors.str_lower:StrLower", + "stream_spacy_tokenizer": "deeppavlov.models.tokenizers.spacy_tokenizer:StreamSpacyTokenizer", + "tag_output_prettifier": 
"deeppavlov.models.morpho_tagger.common:TagOutputPrettifier", + "tfidf_ranker": "deeppavlov.models.ranking.tfidf_ranker:TfidfRanker", + "tokens_matcher": "deeppavlov.models.classifiers.tokens_matcher.tokens_matcher:TokensMatcher", + "top1_elector": "deeppavlov.models.spelling_correction.electors.top1_elector:TopOneElector", + "typos_custom_reader": "deeppavlov.dataset_readers.typos_reader:TyposCustom", + "typos_iterator": "deeppavlov.dataset_iterators.typos_iterator:TyposDatasetIterator", + "typos_kartaslov_reader": "deeppavlov.dataset_readers.typos_reader:TyposKartaslov", + "typos_wikipedia_reader": "deeppavlov.dataset_readers.typos_reader:TyposWikipedia", + "wiki_sqlite_vocab": "deeppavlov.vocabs.wiki_sqlite:WikiSQLiteVocab", + "wikitionary_100K_vocab": "deeppavlov.vocabs.typos:Wiki100KDictionary" +} \ No newline at end of file diff --git a/deeppavlov/core/common/registry.py b/deeppavlov/core/common/registry.py index 028ee42ae0..296d439553 100644 --- a/deeppavlov/core/common/registry.py +++ b/deeppavlov/core/common/registry.py @@ -13,53 +13,58 @@ See the License for the specific language governing permissions and limitations under the License. """ +import importlib +import json +from pathlib import Path -"""Registry for models. Create your models by subclassing one of the abstract model classes (RBModel -, SModel, TModel) and register it. You can assign a code name to the model in the decorator function -parentheses or leave them blank, in the last case the class name will be assigned automatically. -The name should repeat itself in your pipeline json configuration. +from deeppavlov.core.common.log import get_logger +from deeppavlov.core.common.errors import ConfigError -Example: - @registry.register_model('my_model') - class MyModel(TModel) -Note that you should import _REGISTRY variable and all your custom models in the entry point of -your training/inference script. -""" +logger = get_logger(__name__) -from typing import Type, List +_registry_path = Path(__file__).parent / 'registry.json' +if _registry_path.exists(): + with _registry_path.open(encoding='utf-8') as f: + _REGISTRY = json.load(f) +else: + _REGISTRY = {} -from deeppavlov.core.common.log import get_logger -from deeppavlov.core.common.errors import ConfigError -logger = get_logger(__name__) +def cls_from_str(name: str) -> type: + try: + module_name, cls_name = name.split(':') + except ValueError: + raise ConfigError('Expected class description in a `module.submodules:ClassName` form, but got `{}`' + .format(name)) -REGISTRY = {} + return getattr(importlib.import_module(module_name), cls_name) -def register(name: str = None) -> Type: +def register(name: str = None) -> type: """Register model. 
If name is not passed, the model class name is converted to snake-case.""" - def decorate(model_cls: Type, reg_name: str = None) -> Type: + def decorate(model_cls: type, reg_name: str = None) -> type: model_name = reg_name or short_name(model_cls) - global REGISTRY - if model_name in REGISTRY: + global _REGISTRY + cls_name = model_cls.__module__ + ':' + model_cls.__name__ + if model_name in _REGISTRY and _REGISTRY[model_name] != cls_name: logger.warning('Registry name "{}" has been already registered and will be overwritten.'.format(model_name)) - REGISTRY[model_name] = model_cls + _REGISTRY[model_name] = cls_name return model_cls return lambda model_cls_name: decorate(model_cls_name, name) -def short_name(cls: Type) -> str: +def short_name(cls: type) -> str: return cls.__name__.split('.')[-1] -def model(name: str) -> type: - if name not in REGISTRY: +def get_model(name: str) -> type: + if name not in _REGISTRY: raise ConfigError("Model {} is not registered.".format(name)) - return REGISTRY[name] + return cls_from_str(_REGISTRY[name]) -def list_models() -> List: - return list(REGISTRY) +def list_models() -> list: + return list(_REGISTRY) diff --git a/deeppavlov/core/layers/keras_layers.py b/deeppavlov/core/layers/keras_layers.py index 710156df53..3439f69f51 100644 --- a/deeppavlov/core/layers/keras_layers.py +++ b/deeppavlov/core/layers/keras_layers.py @@ -99,3 +99,24 @@ def multiplicative_self_attention(units, n_hidden=None, n_output_features=None, attended_units = Lambda(lambda x: K.sum(x, axis=2))(mult) output = Dense(n_output_features, activation=activation)(attended_units) return output + + +def multiplicative_self_attention_init(n_hidden, n_output_features, activation): + layers = {} + layers["queries"] = Dense(n_hidden) + layers["keys"] = Dense(n_hidden) + layers["output"] = Dense(n_output_features, activation=activation) + return layers + + +def multiplicative_self_attention_get_output(units, layers): + exp1 = Lambda(lambda x: expand_tile(x, axis=1))(units) + exp2 = Lambda(lambda x: expand_tile(x, axis=2))(units) + queries = layers["queries"](exp1) + keys = layers["keys"](exp2) + scores = Lambda(lambda x: K.sum(queries * x, axis=3, keepdims=True))(keys) + attention = Lambda(lambda x: softvaxaxis2(x))(scores) + mult = Multiply()([attention, exp1]) + attended_units = Lambda(lambda x: K.sum(x, axis=2))(mult) + output = layers["output"](attended_units) + return output diff --git a/deeppavlov/core/models/keras_model.py b/deeppavlov/core/models/keras_model.py index 33936d0bff..a0462e4627 100644 --- a/deeppavlov/core/models/keras_model.py +++ b/deeppavlov/core/models/keras_model.py @@ -55,6 +55,9 @@ def __init__(self, **kwargs): load_path = self.opt.get('load_path', None) url = self.opt.get('url', None) self.model = None + self.epochs_done = 0 + self.batches_seen = 0 + self.train_examples_seen = 0 super().__init__(save_path=save_path, load_path=load_path, @@ -100,13 +103,13 @@ def init_model_from_scratch(self, model_name, optimizer_name, loss_name, lear_ra if callable(optimizer_func): if not(lear_rate is None): if not(lear_rate_decay is None): - optimizer_ = optimizer_func(lr=lear_rate, decay=lear_rate_decay) + self.optimizer = optimizer_func(lr=lear_rate, decay=lear_rate_decay) else: - optimizer_ = optimizer_func(lr=lear_rate) + self.optimizer = optimizer_func(lr=lear_rate) elif not(lear_rate_decay is None): - optimizer_ = optimizer_func(decay=lear_rate_decay) + self.optimizer = optimizer_func(decay=lear_rate_decay) else: - optimizer_ = optimizer_func() + self.optimizer = 
optimizer_func() else: raise AttributeError("Optimizer {} is not defined in `keras.optimizers`".format(optimizer_name)) @@ -116,7 +119,7 @@ def init_model_from_scratch(self, model_name, optimizer_name, loss_name, lear_ra else: raise AttributeError("Loss {} is not defined in `keras.losses`".format(loss_name)) - model.compile(optimizer=optimizer_, loss=loss) + model.compile(optimizer=self.optimizer, loss=loss) return model @overrides @@ -136,7 +139,7 @@ def load(self, model_name, optimizer_name, loss_name, lear_rate=None, lear_rate_ """ if self.load_path: if isinstance(self.load_path, Path) and not self.load_path.parent.is_dir(): - raise ConfigError("Provided save path is incorrect!") + raise ConfigError("Provided load path is incorrect!") opt_path = Path("{}_opt.json".format(str(self.load_path.resolve()))) weights_path = Path("{}.h5".format(str(self.load_path.resolve()))) @@ -160,13 +163,13 @@ def load(self, model_name, optimizer_name, loss_name, lear_rate=None, lear_rate_ if callable(optimizer_func): if not (lear_rate is None): if not (lear_rate_decay is None): - optimizer_ = optimizer_func(lr=lear_rate, decay=lear_rate_decay) + self.optimizer = optimizer_func(lr=lear_rate, decay=lear_rate_decay) else: - optimizer_ = optimizer_func(lr=lear_rate) + self.optimizer = optimizer_func(lr=lear_rate) elif not (lear_rate_decay is None): - optimizer_ = optimizer_func(decay=lear_rate_decay) + self.optimizer = optimizer_func(decay=lear_rate_decay) else: - optimizer_ = optimizer_func() + self.optimizer = optimizer_func() else: raise AttributeError("Optimizer {} is not defined in `keras.optimizers`".format(optimizer_name)) @@ -176,7 +179,7 @@ def load(self, model_name, optimizer_name, loss_name, lear_rate=None, lear_rate_ else: raise AttributeError("Loss {} is not defined".format(loss_name)) - model.compile(optimizer=optimizer_, + model.compile(optimizer=self.optimizer, loss=loss) return model else: @@ -211,6 +214,10 @@ def save(self, fname=None): # if model was loaded from one path and saved to another one # then change load_path to save_path for config + self.opt["epochs_done"] = self.epochs_done + self.opt["final_lear_rate"] = K.eval(self.optimizer.lr) / (1. 
+ + K.eval(self.optimizer.decay) * self.batches_seen) + if self.opt.get("load_path") and self.opt.get("save_path"): if self.opt.get("save_path") != self.opt.get("load_path"): self.opt["load_path"] = str(self.opt["save_path"]) @@ -239,3 +246,19 @@ def mlp(self, opt): @abstractmethod def reset(self): pass + + def process_event(self, event_name, data): + """ + Process event after epoch + Args: + event_name: whether event is send after epoch or batch + data: event data (dictionary) + + Returns: + None + """ + if event_name == "after_epoch": + self.epochs_done = data["epochs_done"] + self.batches_seen = data["batches_seen"] + self.train_examples_seen = data["train_examples_seen"] + return diff --git a/deeppavlov/dataset_iterators/ranking_iterator.py b/deeppavlov/dataset_iterators/ranking_iterator.py index 35d4178aa3..2b05bb081f 100644 --- a/deeppavlov/dataset_iterators/ranking_iterator.py +++ b/deeppavlov/dataset_iterators/ranking_iterator.py @@ -1,15 +1,16 @@ from deeppavlov.core.common.registry import register +from deeppavlov.core.data.data_learning_iterator import DataLearningIterator import numpy as np @register('ranking_iterator') -class RankingIterator: +class RankingIterator(DataLearningIterator): def __init__(self, data, len_vocab, sample_candidates, sample_candidates_valid, sample_candidates_test, num_negative_samples, num_ranking_samples_valid, num_ranking_samples_test, - seed=None): + shuffle=False, seed=None): self.len_vocab = len_vocab self.sample_candidates = sample_candidates self.sample_candidates_valid = sample_candidates_valid @@ -29,6 +30,9 @@ def __init__(self, data, len_vocab, 'all': self.train + self.test + self.valid } + super().__init__(self.data, seed=seed, shuffle=shuffle) + + def gen_batches(self, batch_size, data_type="train", shuffle=True): y = batch_size * [np.ones(2)] data = self.data[data_type] diff --git a/deeppavlov/dataset_readers/basic_classification_reader.py b/deeppavlov/dataset_readers/basic_classification_reader.py index f567fe2add..6d95eca276 100644 --- a/deeppavlov/dataset_readers/basic_classification_reader.py +++ b/deeppavlov/dataset_readers/basic_classification_reader.py @@ -81,7 +81,10 @@ def read(self, data_path, url=None, *args, **kwargs): x = kwargs.get("x", "text") y = kwargs.get('y', 'labels') class_sep = kwargs.get('class_sep', ',') - data[data_type] = [(row[x], str(row[y]).split(class_sep)) for _, row in df.iterrows()] + if isinstance(x, list): + data[data_type] = [([row[x_] for x_ in x], str(row[y]).split(class_sep)) for _, row in df.iterrows()] + else: + data[data_type] = [(row[x], str(row[y]).split(class_sep)) for _, row in df.iterrows()] else: log.warning("Cannot find {} file".format(file)) diff --git a/deeppavlov/deep.py b/deeppavlov/deep.py index 826125cbb2..ab8fd267c7 100644 --- a/deeppavlov/deep.py +++ b/deeppavlov/deep.py @@ -28,6 +28,7 @@ from deeppavlov.download import deep_download from utils.telegram_utils.telegram_ui import interact_model_by_telegram from utils.server_utils.server import start_model_server +from utils.pip_wrapper import install_from_config log = get_logger(__name__) @@ -35,7 +36,8 @@ parser = argparse.ArgumentParser() parser.add_argument("mode", help="select a mode, train or interact", type=str, - choices={'train', 'evaluate', 'interact', 'predict', 'interactbot', 'riseapi', 'download'}) + choices={'train', 'evaluate', 'interact', 'predict', 'interactbot', 'riseapi', 'download', + 'install'}) parser.add_argument("config_path", help="path to a pipeline json config", type=str) parser.add_argument("-t", "--token", 
help="telegram bot token", type=str) parser.add_argument("-b", "--batch-size", dest="batch_size", default=1, help="inference batch size", type=int) @@ -75,6 +77,8 @@ def main(): start_model_server(pipeline_config_path) elif args.mode == 'predict': predict_on_stream(pipeline_config_path, args.batch_size, args.file_path) + elif args.mode == 'install': + install_from_config(pipeline_config_path) if __name__ == "__main__": diff --git a/deeppavlov/evolve.py b/deeppavlov/evolve.py new file mode 100644 index 0000000000..8deff2d21f --- /dev/null +++ b/deeppavlov/evolve.py @@ -0,0 +1,322 @@ +""" +Copyright 2017 Neural Networks and Deep Learning lab, MIPT + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import argparse +from pathlib import Path +import sys +import os +import json +from copy import deepcopy +from subprocess import Popen, PIPE +import pandas as pd + +p = (Path(__file__) / ".." / "..").resolve() +sys.path.append(str(p)) + +from deeppavlov.core.common.errors import ConfigError +from deeppavlov.models.evolution.evolution_param_generator import ParamsEvolution +from deeppavlov.core.common.file import read_json, save_json +from deeppavlov.core.common.log import get_logger +from deeppavlov.core.commands.utils import set_deeppavlov_root, expand_path + +log = get_logger(__name__) + +parser = argparse.ArgumentParser() + +parser.add_argument("config_path", help="path to a pipeline json config", type=str) +parser.add_argument('--key_main_model', help='key inserted in dictionary of main model in pipe', default="main") +parser.add_argument('--p_cross', help='probability of crossover', type=float, default=0.2) +parser.add_argument('--pow_cross', help='crossover power', type=float, default=0.1) +parser.add_argument('--p_mut', help='probability of mutation', type=float, default=1.) +parser.add_argument('--pow_mut', help='mutation power', type=float, default=0.1) + +parser.add_argument('--p_size', help='population size', type=int, default=10) +parser.add_argument('--gpus', help='visible GPUs divided by comma <<,>>', default="-1") +parser.add_argument('--train_partition', + help='partition of splitted train file', default=1) +parser.add_argument('--start_from_population', + help='population number to start from. 0 means from scratch', default=0) +parser.add_argument('--path_to_population', + help='path to population to start from', default="") +parser.add_argument('--elitism_with_weights', + help='whether to save elite models with weights or without', action='store_true') +parser.add_argument('--iterations', help='Number of iterations', type=int, default=-1) + + +def find_config(pipeline_config_path: str): + if not Path(pipeline_config_path).is_file(): + configs = [c for c in Path(__file__).parent.glob(f'configs/**/{pipeline_config_path}.json') + if str(c.with_suffix('')).endswith(pipeline_config_path)] # a simple way to not allow * and ? 
+ if configs: + log.info(f"Interpreting '{pipeline_config_path}' as '{configs[0]}'") + pipeline_config_path = str(configs[0]) + return pipeline_config_path + + +def main(): + args = parser.parse_args() + + pipeline_config_path = find_config(args.config_path) + key_main_model = args.key_main_model + population_size = args.p_size + gpus = [int(gpu) for gpu in args.gpus.split(",")] + train_partition = int(args.train_partition) + start_from_population = int(args.start_from_population) + path_to_population = args.path_to_population + elitism_with_weights = args.elitism_with_weights + iterations = int(args.iterations) + + p_crossover = args.p_cross + pow_crossover = args.pow_cross + p_mutation = args.p_mut + pow_mutation = args.pow_mut + + if os.environ.get("CUDA_VISIBLE_DEVICES") is None: + pass + else: + cvd = [int(gpu) for gpu in os.environ.get("CUDA_VISIBLE_DEVICES").split(",")] + if gpus == [-1]: + gpus = cvd + else: + try: + gpus = [cvd[gpu] for gpu in gpus] + except: + raise ConfigError("Can not use gpus `{}` with CUDA_VISIBLE_DEVICES='{}'".format( + ",".join(gpus), ",".join(cvd) + )) + + basic_params = read_json(pipeline_config_path) + log.info("Given basic params: {}\n".format(json.dumps(basic_params, indent=2))) + + # Initialize evolution + evolution = ParamsEvolution(population_size=population_size, + p_crossover=p_crossover, crossover_power=pow_crossover, + p_mutation=p_mutation, mutation_power=pow_mutation, + key_main_model=key_main_model, + seed=42, + train_partition=train_partition, + elitism_with_weights=elitism_with_weights, + **basic_params) + + considered_metrics = evolution.get_value_from_config(evolution.basic_config, + list(evolution.find_model_path( + evolution.basic_config, "metrics"))[0] + ["metrics"]) + + log.info(considered_metrics) + evolve_metric = considered_metrics[0] + + # Create table variable for gathering results + set_deeppavlov_root(evolution.basic_config) + + expand_path(Path(evolution.get_value_from_config( + evolution.basic_config, evolution.main_model_path + ["save_path"]))).mkdir(parents=True, exist_ok=True) + + result_file = expand_path(Path(evolution.get_value_from_config(evolution.basic_config, + evolution.main_model_path + ["save_path"]) + ).joinpath("result_table.csv")) + + result_table_columns = [] + result_table_dict = {} + for el in considered_metrics: + result_table_dict[el + "_valid"] = [] + result_table_dict[el + "_test"] = [] + result_table_columns.extend([el + "_valid", el + "_test"]) + + result_table_dict["params"] = [] + result_table_columns.append("params") + + if start_from_population == 0: + # if starting evolution from scratch + iters = 0 + result_table = pd.DataFrame(result_table_dict) + # write down result table file + result_table.loc[:, result_table_columns].to_csv(result_file, index=False, sep='\t') + + log.info("Iteration #{} starts".format(iters)) + # randomly generate the first population + population = evolution.first_generation() + else: + # if starting evolution from already existing population + iters = start_from_population + log.info("Iteration #{} starts".format(iters)) + + population = [] + for i in range(population_size): + population.append(read_json(expand_path(Path(path_to_population).joinpath( + "model_" + str(i)).joinpath("config.json")))) + population[i] = evolution.insert_value_or_dict_into_config( + population[i], evolution.main_model_path + ["save_path"], + str(Path( + evolution.get_value_from_config(evolution.basic_config, evolution.main_model_path + ["save_path"]) + ).joinpath( + "population_" + 
str(start_from_population)).joinpath( + "model_" + str(i)).joinpath( + "model"))) + + population[i] = evolution.insert_value_or_dict_into_config( + population[i], evolution.main_model_path + ["load_path"], + str(Path( + evolution.get_value_from_config(population[i], evolution.main_model_path + ["load_path"])))) + + for path_id, path_ in enumerate(evolution.paths_to_fiton_dicts): + population[i] = evolution.insert_value_or_dict_into_config( + population[i], path_ + ["save_path"], + str(Path(evolution.get_value_from_config(evolution.basic_config, + evolution.main_model_path + ["save_path"]) + ).joinpath("population_" + str(iters)).joinpath("model_" + str(i)).joinpath( + "fitted_model_" + str(path_id)))) + + for path_id, path_ in enumerate(evolution.paths_to_fiton_dicts): + population[i] = evolution.insert_value_or_dict_into_config( + population[i], path_ + ["load_path"], + str(Path(evolution.get_value_from_config( + population[i], path_ + ["load_path"])))) + + run_population(population, evolution, gpus) + population_scores = results_to_table(population, evolution, considered_metrics, + result_file, result_table_columns)[evolve_metric] + log.info("Population scores: {}".format(population_scores)) + log.info("Iteration #{} was done".format(iters)) + iters += 1 + + while True: + if iterations != -1 and start_from_population + iterations == iters: + log.info("End of evolution on iteration #{}".format(iters)) + break + log.info("Iteration #{} starts".format(iters)) + population = evolution.next_generation(population, population_scores, iters) + run_population(population, evolution, gpus) + population_scores = results_to_table(population, evolution, considered_metrics, + result_file, result_table_columns)[evolve_metric] + log.info("Population scores: {}".format(population_scores)) + log.info("Iteration #{} was done".format(iters)) + iters += 1 + + +def run_population(population, evolution, gpus): + """ + Change save and load paths for obtained population, save config.json with model config, + run population via current python executor (with which evolve.py already run) + and on given devices (-1 means CPU, other integeres - visible for evolve.py GPUs) + Args: + population: list of dictionaries - configs of current population + evolution: ParamsEvolution + gpus: list of given devices (list of integers) + + Returns: + None + """ + population_size = len(population) + for k in range(population_size // len(gpus) + 1): + procs = [] + for j in range(len(gpus)): + i = k * len(gpus) + j + if i < population_size: + save_path = expand_path(Path(evolution.get_value_from_config( + population[i], evolution.main_model_path + ["save_path"])).parent) + + save_path.mkdir(parents=True, exist_ok=True) + f_name = save_path.joinpath("config.json") + save_json(population[i], f_name) + + if len(gpus) == 1 and gpus[0] == -1: + procs.append(Popen("{} -m deeppavlov train {}" + " 1>{}/out.txt 2>{}/err.txt".format(sys.executable, + str(f_name), + str(save_path), + str(save_path) + ), + shell=True, stdout=PIPE, stderr=PIPE)) + else: + procs.append(Popen("CUDA_VISIBLE_DEVICES={} {} -m deeppavlov train {}" + " 1>{}/out.txt 2>{}/err.txt".format(gpus[j], + sys.executable, + str(f_name), + str(save_path), + str(save_path) + ), + shell=True, stdout=PIPE, stderr=PIPE)) + for j, proc in enumerate(procs): + i = k * len(gpus) + j + log.info(f'Waiting on {i}th proc') + proc.wait() + return None + + +def results_to_table(population, evolution, considered_metrics, result_file, result_table_columns): + population_size = len(population) + 
validate_best = evolution.get_value_from_config(evolution.basic_config, + list(evolution.find_model_path( + evolution.basic_config, "validate_best"))[0] + + ["validate_best"]) + test_best = evolution.get_value_from_config(evolution.basic_config, + list(evolution.find_model_path( + evolution.basic_config, "test_best"))[0] + + ["test_best"]) + if (not validate_best) and test_best: + log.info("Validate_best is set to False. Tuning parameters on test") + elif (not validate_best) and (not test_best): + raise ConfigError("Validate_best and test_best are set to False. Can not evolve.") + + population_metrics = {} + for m in considered_metrics: + population_metrics[m] = [] + for i in range(population_size): + with open(str(expand_path(Path(evolution.get_value_from_config( + population[i], + evolution.main_model_path + ["save_path"])).parent.joinpath("out.txt"))), "r") as fout: + reports_data = fout.read().splitlines()[-2:] + reports = [] + for j in range(2): + try: + reports.append(json.loads(reports_data[j])) + except: + pass + + val_results = {} + test_results = {} + for m in considered_metrics: + val_results[m] = None + test_results[m] = None + if len(reports) == 2 and "valid" in reports[0].keys() and "test" in reports[1].keys(): + val_results = reports[0]["valid"]["metrics"] + test_results = reports[1]["test"]["metrics"] + elif len(reports) == 1 and "valid" in reports[0].keys(): + val_results = reports[0]["valid"]["metrics"] + elif len(reports) == 1 and "test" in reports[0].keys(): + test_results = reports[0]["test"]["metrics"] + + result_table_dict = {} + for el in result_table_columns: + result_table_dict[el] = [] + + for m in considered_metrics: + result_table_dict[m + "_valid"].append(val_results[m]) + result_table_dict[m + "_test"].append(test_results[m]) + if validate_best: + population_metrics[m].append(val_results[m]) + elif test_best: + population_metrics[m].append(test_results[m]) + + result_table_dict[result_table_columns[-1]] = [population[i]] + result_table = pd.DataFrame(result_table_dict) + result_table.loc[:, result_table_columns].to_csv(result_file, index=False, sep='\t', mode='a', header=None) + + return population_metrics + + +if __name__ == "__main__": + main() diff --git a/deeppavlov/metrics/fmeasure_classification.py b/deeppavlov/metrics/fmeasure_classification.py index 83ecc60c6a..22d318918c 100644 --- a/deeppavlov/metrics/fmeasure_classification.py +++ b/deeppavlov/metrics/fmeasure_classification.py @@ -19,13 +19,36 @@ from sklearn.metrics import f1_score from deeppavlov.core.common.metrics_registry import register_metric -from deeppavlov.models.classifiers.intents.utils import labels2onehot +from deeppavlov.models.classifiers.utils import labels2onehot @register_metric('classification_f1') def fmeasure(y_true, y_predicted, average="macro"): """ - Calculate F1-measure + Calculate F1-measure macro + Args: + y_true: array of true binary labels + y_predicted: list of predictions. 
+ Each prediction is a tuple of two elements + (predicted_labels, dictionary like {"label_i": probability_i} ) + where probability is float or keras.tensor + average: determines the type of averaging performed on the data + + Returns: + F1-measure + """ + classes = np.array(list(y_predicted[0][1].keys())) + y_true_one_hot = labels2onehot(y_true, classes) + y_pred_labels = [y_predicted[i][0] for i in range(len(y_predicted))] + y_pred_one_hot = labels2onehot(y_pred_labels, classes) + + return f1_score(y_true_one_hot, y_pred_one_hot, average=average) + + +@register_metric('classification_f1_weighted') +def fmeasure(y_true, y_predicted, average="weighted"): + """ + Calculate F1-measure weighted Args: y_true: array of true binary labels y_predicted: list of predictions. diff --git a/deeppavlov/metrics/log_loss.py b/deeppavlov/metrics/log_loss.py new file mode 100644 index 0000000000..ec42196391 --- /dev/null +++ b/deeppavlov/metrics/log_loss.py @@ -0,0 +1,30 @@ +""" +Copyright 2017 Neural Networks and Deep Learning lab, MIPT + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from sklearn.metrics import log_loss +import numpy as np + +from deeppavlov.core.common.metrics_registry import register_metric +from deeppavlov.models.classifiers.utils import labels2onehot + + +@register_metric('classification_log_loss') +def classification_log_loss(y_true, y_predicted): + classes = np.array(list(y_predicted[0][1].keys())) + y_true_one_hot = labels2onehot(y_true, classes) + y_pred_probas = [list(y_predicted[i][1].values()) for i in range(len(y_predicted))] + + return log_loss(y_true_one_hot, y_pred_probas) diff --git a/deeppavlov/metrics/mrr_classification.py b/deeppavlov/metrics/mrr_classification.py new file mode 100644 index 0000000000..438b9fbbd7 --- /dev/null +++ b/deeppavlov/metrics/mrr_classification.py @@ -0,0 +1,97 @@ +""" +Copyright 2017 Neural Networks and Deep Learning lab, MIPT + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import numpy as np +import json +from scipy.stats import rankdata + +from deeppavlov.core.common.metrics_registry import register_metric +from deeppavlov.models.classifiers.utils import labels2onehot + + +def calc_mrr(rank): + rank = list(map(lambda x: 1./x, rank)) + return np.mean(rank) + + +def mrr_from_json(fname): + data = [] + with open(fname) as f: + for line in f.readlines(): + data += [json.loads(line)] + rank_i = [] + for elem in data: + cand = elem['candidates'] + results = elem['results'] + cand_ranks = (len(results) - rankdata(results, method='average'))[cand] + 1 + rank_i.append( min(cand_ranks)) + mrr = calc_mrr(rank_i) + return mrr + + +def mrr_from_dict(data): + rank_i = [] + for elem in data: + cand = elem['candidates'] + results = elem['results'] + cand_ranks = (len(results) - rankdata(results, method='average'))[cand] + 1 + rank_i.append( min(cand_ranks)) + mrr = calc_mrr(rank_i) + return mrr + + +def make_json_predictions(fname, predictions): + data = [] + with open(fname) as f: + for line in f.readlines(): + data += [json.loads(line)] + + pointer = 0 + for elem_id, elem in enumerate(data): + n = len(elem["sentences"]) + results = [] + for i in range(n): + if elem["sentences"][i] == "": + results.append(0) + else: + results.append(1 * (predictions[pointer])) + pointer += 1 + data[elem_id]["results"] = results + return data + + +@register_metric('classification_mrr') +def mrr_score(y_true, y_predicted): + # there is hard code for selqa dataset! + if len(y_predicted) == 66438: + data_type = "train" + elif len(y_predicted) == 9377: + data_type = "dev" + elif len(y_predicted) == 19435: + data_type = "test" + else: + return 0. + + classes = np.array(list(y_predicted[0][1].keys())) + y_true_one_hot = labels2onehot(y_true, classes) + y_pred_probas = [y_predicted[i][1]["correct"] for i in range(len(y_predicted))] + + json_with_predictions = make_json_predictions("/home/dilyara.baymurzina/evolution_data/selqa_data/SelQA-ass-" + + data_type + ".json", + y_pred_probas) + + score = mrr_from_dict(json_with_predictions) + return score diff --git a/deeppavlov/metrics/roc_auc_score.py b/deeppavlov/metrics/roc_auc_score.py index 568a8d680c..fb44eb7e6c 100644 --- a/deeppavlov/metrics/roc_auc_score.py +++ b/deeppavlov/metrics/roc_auc_score.py @@ -18,7 +18,7 @@ import numpy as np from deeppavlov.core.common.metrics_registry import register_metric -from deeppavlov.models.classifiers.intents.utils import labels2onehot +from deeppavlov.models.classifiers.utils import labels2onehot def roc_auc_score_np(y_true, y_pred): diff --git a/deeppavlov/models/api_requester/api_requester.py b/deeppavlov/models/api_requester/api_requester.py index 88bff9d9e2..1120728fc4 100644 --- a/deeppavlov/models/api_requester/api_requester.py +++ b/deeppavlov/models/api_requester/api_requester.py @@ -1,4 +1,5 @@ import requests +import asyncio from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component @@ -6,7 +7,8 @@ @register('api_requester') class ApiRequester(Component): - def __init__(self, url: str, out: [int, list], param_names=(), debatchify=False, *args, **kwargs): + def __init__(self, url: str, out: [int, list], param_names=(), debatchify=False, *args, + **kwargs): self.url = url self.param_names = param_names self.out_count = out if isinstance(out, int) else len(out) @@ -20,8 +22,15 @@ def __call__(self, *args, **kwargs): for v in data.values(): batch_size = len(v) break - response = [requests.post(self.url, json={k: v[i] for k, v in 
data.items()}).json() - for i in range(batch_size)] + + assert batch_size > 0 + + async def collect(): + return [j async for j in self.get_async_response(data, batch_size)] + + loop = asyncio.get_event_loop() + response = loop.run_until_complete(collect()) + else: response = requests.post(self.url, json=data).json() @@ -29,3 +38,18 @@ def __call__(self, *args, **kwargs): response = list(zip(*response)) return response + + async def get_async_response(self, data, batch_size): + loop = asyncio.get_event_loop() + futures = [ + loop.run_in_executor( + None, + requests.post, + self.url, + None, + {k: v[i] for k, v in data.items()} + ) + for i in range(batch_size) + ] + for r in await asyncio.gather(*futures): + yield r.json() diff --git a/deeppavlov/models/api_requester/api_router.py b/deeppavlov/models/api_requester/api_router.py new file mode 100644 index 0000000000..61cb4678c5 --- /dev/null +++ b/deeppavlov/models/api_requester/api_router.py @@ -0,0 +1,33 @@ +from concurrent.futures import ProcessPoolExecutor +import concurrent + +from deeppavlov.core.common.registry import register +from deeppavlov.core.common.log import get_logger +from deeppavlov.core.models.component import Component + +logger = get_logger(__name__) + + +@register("api_router") +class ApiRouter(Component): + + def __init__(self, api_requesters, n_workers=1, *args, **kwargs): + self.api_requesters = api_requesters + self.n_workers = n_workers + + def __call__(self, *args, **kwargs): + with ProcessPoolExecutor(self.n_workers) as executor: + futures = [executor.submit(api_requester, *args) for api_requester + in + self.api_requesters] + + concurrent.futures.wait(futures) + results = [] + for future, api_requester in zip(futures, self.api_requesters): + result = future.result() + if api_requester.out_count > 1: + results += result + else: + results.append(result) + + return results diff --git a/deeppavlov/models/classifiers/intents/README.md b/deeppavlov/models/classifiers/README.md similarity index 93% rename from deeppavlov/models/classifiers/intents/README.md rename to deeppavlov/models/classifiers/README.md index 470966cc61..b48056ba9c 100644 --- a/deeppavlov/models/classifiers/intents/README.md +++ b/deeppavlov/models/classifiers/README.md @@ -10,7 +10,7 @@ The model can be used for binary, multi-class or multi-label classification. We also provide with **pre-trained models** for classification on DSTC 2 dataset, SNIPS dataset, "AG News" dataset, "Detecting Insults in Social Commentary", Twitter sentiment in Russian dataset. **DSTC 2 dataset** (http://camdial.org/~mh521/dstc/) does not initially contain information about **intents**, -therefore, `IntentDataset` (`deeppavlov/datasets/intent_dataset.py`) instance extracts +therefore, `Dstc2IntentsDatasetIterator` (`deeppavlov/dataset_iterators/dstc2_intents_interator.py`) instance extracts artificial intents for each user reply using information from acts and slots. Below we give several examples of intent construction: @@ -47,7 +47,7 @@ This message contains two intents `(thankyou, bye)`. Train, valid and test divis * SearchScreeningEvent * SearchCreativeWork -Initially, intent model on SNIPS dataset was trained only as an example of usage that is why we provide pre-trained model for SNIPS with embeddings trained on DSTC-2 dataset that is not the best choice for this task. Train set is divided to train and validation sets to illustrate `basic_classification_iterator` work. 
+Initially, the classification model on the SNIPS dataset was trained only as a usage example, which is why we provide a pre-trained model for SNIPS with embeddings trained on the DSTC-2 dataset; this is not the best choice for this task. The train set is divided into train and validation sets to illustrate how `basic_classification_iterator` works. **AG News** dataset (https://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html) contains a **topic classification** task for 4 classes. The test set is the original one from the web-site, the validation set is a stratified 1/5 split of the original train set with seed 42, and the train set is the rest. @@ -144,10 +144,10 @@ Some clue parameters for [intents_dstc2.json](../../../configs/intents/intents_d | Parameter | Description | |---------------------|-------------------------------------------------------------------| | **dataset_reader** | **an object that reads datasets from files** | -| name | registered name of the dataset reader
*SetOfValues*: "dstc2_datasetreader", "classification_datasetreader" | +| name | registered name of the dataset reader
*SetOfValues*: "dstc2_reader", "basic_classification_reader" | | data_path | directory where data files are stored | | **dataset_iterator** | **an object that provides models with data in the standard form (each example is a tuple (x, y) where x and y could be numbers, booleans, lists or strings)** | -| name | registered name of the dataset
*SetOfValues*: "intent_dataset", classification_dataset" | +| name | registered name of the dataset
*SetOfValues*: "dstc2_intents_iterator", basic_classification_iterator" | | seed | seed for the batch generator | | fields_to_merge | list of fields to merge
*SetOfValues*: list of fields, i.e. ["train", "valid", "test"] | | merged_field | name of the field where the merged fields should be saved
*SetOfValues*: field, i.e "train", "valid", "test" | @@ -185,7 +185,7 @@ Some clue parameters for [intents_dstc2.json](../../../configs/intents/intents_d | load_path | path to file from which model files will be loaded | | save_path | path to file where model files will be saved | | classes | list of class names. In this case they could be simply obtained from vocab `classes_vocab.keys()` method. To make reference one has to set value to "#classes_vocab.keys()" | -| model_name | method of the class KerasIntentModel that corresponds to the model
*SetOfValues*: `cnn_model`, `dcnn_model`, `cnn_model_max_and_aver_pool`, `bilstm_model`, `bilstm_bilstm_model`, `bilstm_cnn_model`, `cnn_bilstm_model`, `bilstm_self_add_attention_model`, `bilstm_self_mult_attention_model`, `bigru_model` | +| model_name | method of the class KerasClassificationModel that corresponds to the model
*SetOfValues*: `cnn_model`, `dcnn_model`, `cnn_model_max_and_aver_pool`, `bilstm_model`, `bilstm_bilstm_model`, `bilstm_cnn_model`, `cnn_bilstm_model`, `bilstm_self_add_attention_model`, `bilstm_self_mult_attention_model`, `bigru_model` | | text_size | length of each sample in words | | confident_threshold | probability threshold for an instance belonging to a class
*SetOfValues*: \[0., 1.\] | | lear_rate | learning rate for training | @@ -220,9 +220,9 @@ python deep.py train configs/intents/intents_dstc2.json ### Train on other datasets -Constructing intents from DSTC 2 makes `IntentDataset` difficult to use. -Therefore, we also provide another dataset reader `ClassificationDatasetReader` and dataset `ClassificationDataset` -to work with `.csv` files. These classes are described in `deeppavlov/dataset_readers` and `deeppavlov/datasets`. +Constructing intents from DSTC 2 makes `Dstc2IntentsDatasetIterator` difficult to use. +Therefore, we also provide another dataset reader `BasicClassificationDatasetReader` and dataset `BasicClassificationDatasetIterator` +to work with `.csv` files. These classes are described in `deeppavlov/dataset_readers/basic_classification_reader.py` and `deeppavlov/dataset_iterators/basic_classification_dataset_iterator.py`. Training data file `train.csv` (and `valid.csv`, if exists) should be in the following format: diff --git a/deeppavlov/models/classifiers/intents/intent_model.py b/deeppavlov/models/classifiers/keras_classification_model.py similarity index 98% rename from deeppavlov/models/classifiers/intents/intent_model.py rename to deeppavlov/models/classifiers/keras_classification_model.py index 36f821110d..c14faf4cfd 100644 --- a/deeppavlov/models/classifiers/intents/intent_model.py +++ b/deeppavlov/models/classifiers/keras_classification_model.py @@ -29,9 +29,9 @@ from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.registry import register from deeppavlov.core.models.keras_model import KerasModel -from deeppavlov.models.classifiers.intents.utils import labels2onehot, proba2labels +from deeppavlov.models.classifiers.utils import labels2onehot, proba2labels +from deeppavlov.models.classifiers.utils import md5_hashsum from deeppavlov.models.embedders.fasttext_embedder import FasttextEmbedder -from deeppavlov.models.classifiers.intents.utils import md5_hashsum from deeppavlov.models.tokenizers.nltk_tokenizer import NLTKTokenizer from deeppavlov.core.common.log import get_logger from deeppavlov.core.layers.keras_layers import additive_self_attention, multiplicative_self_attention @@ -40,8 +40,8 @@ log = get_logger(__name__) -@register('intent_model') -class KerasIntentModel(KerasModel): +@register('keras_classification_model') +class KerasClassificationModel(KerasModel): """ Class implements keras model for intent recognition task for multi-class multi-label data """ @@ -124,6 +124,7 @@ def __init__(self, **kwargs): if self.opt['fasttext_md5'] != current_fasttext_md5: raise ConfigError( "Given fasttext model does NOT match fasttext model used previously to train loaded model") + print("Model was successfully initialized!\nModel summary:\n{}".format(self.model.summary())) def _init_missed_params(self): """ diff --git a/deeppavlov/models/classifiers/intents/utils.py b/deeppavlov/models/classifiers/utils.py similarity index 93% rename from deeppavlov/models/classifiers/intents/utils.py rename to deeppavlov/models/classifiers/utils.py index a8620f31ef..36c9f4fccd 100644 --- a/deeppavlov/models/classifiers/intents/utils.py +++ b/deeppavlov/models/classifiers/utils.py @@ -35,16 +35,14 @@ def labels2onehot(labels, classes): 2d array with one-hot representation of given samples """ n_classes = len(classes) - eye = np.eye(n_classes) y = [] for sample in labels: curr = np.zeros(n_classes) for intent in sample: if intent not in classes: - log.warning('Unknown intent {} detected'.format(intent)) - curr 
+= eye[np.where(np.array(classes) == 'unknown')[0]].reshape(-1) + log.warning('Unknown intent {} detected. Assigning no class'.format(intent)) else: - curr += eye[np.where(np.array(classes) == intent)[0]].reshape(-1) + curr[np.where(np.array(classes) == intent)[0]] = 1 y.append(curr) y = np.asarray(y) return y diff --git a/deeppavlov/models/evolution/README.md b/deeppavlov/models/evolution/README.md new file mode 100644 index 0000000000..2dd013abbc --- /dev/null +++ b/deeppavlov/models/evolution/README.md @@ -0,0 +1,56 @@ +[![License Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](/LICENSE.txt) +![Python 3.6](https://img.shields.io/badge/python-3.6-green.svg) + +# Parameters evolution for DeepPavlov models + +This directory contains an implementation of parameters evolution for DeepPavlov models. + +The evolution process can be described in the following way: +* Initialize the parameters of the evolutionary process by setting the following arguments of `evolve.py`: + - `--p_size` - number of individuals (models) per population (*Default: 10*). + - `--key_main_model` - key of the dictionary in the config that contains the model being trained (see description below) (*Default: "main"*). + - `--p_cross` - probability of crossover for a parent pair (*Default: 0.2*). + - `--pow_cross` - crossover power: the portion of evolving parameters that will be exchanged between parents during crossover (*Default: 0.1*). + - `--p_mut` - probability of mutation for a parameter (*Default: 1.*). + - `--pow_mut` - mutation power: the maximal portion of the maximal possible value of a parameter that can be added or subtracted during mutation (*Default: 0.1*). + - `--gpus` - available GPUs divided by comma "," (*Default: -1 means CPU support*). If one runs `evolve.py` with `CUDA_VISIBLE_DEVICES` assigned, the given GPU indices are either ordinal numbers of the devices listed in `CUDA_VISIBLE_DEVICES` (e.g. `CUDA_VISIBLE_DEVICES=3,4,5` and `--gpus 1,2` mean running models on the original GPUs `4,5`), or all devices from `CUDA_VISIBLE_DEVICES` if `--gpus` is not given. + - `--train_partition` - if the train file is too big to train on (splitting is recommended when the train dataset has more than 100 thousand examples), one can split it into `train_partition` files named "any_name_{0}.any_extension", ..., "any_name_{`train_partition`}.any_extension" and point the "train" field of the dataset_reader to the first file. Each population is trained on the N_{population} % `train_partition` part of the dataset (*Default: 1*). + - `--start_from_population` - the number of the population to start from; needed to restart evolution (*Default: 0 means starting from population 0, i.e. from scratch*). + - `--path_to_population` - path to the directory "population_{`start_from_population`}". Should be given if `start_from_population` is not 0 (*Default: ""*). + - `--elitism_with_weights` - whether to initialize elite models with pre-trained weights from the previous population or not (*Default: False means elite models are saved without weights; if the flag is given, elite models are saved with weights*). + - `--iterations` - number of iterations to conduct (*Default: -1 means an infinite number of iterations (while loop)*). + +* **Warning**: `metrics` cannot be evolved because the main metric determines the evolutionary process. + +* The current version allows evolving any parameter of the config that is an item of some dictionary in the config file. One can make a copy of a usual DeepPavlov model config and reassign the parameters that should be tuned during evolution.
+To evolve some parameter one has to replace its value with a dictionary of one of the following types (see the config fragment sketch after this section): + - ```{"evolve_range": [min_value, max_value]}``` - values uniformly distributed on the given interval, + - ```{"evolve_range": [min_value, max_value], "scale": "log"}``` - values distributed on the given interval logarithmically, + - ```{"evolve_range": [min_value, max_value], "discrete": true}``` - discrete values uniformly distributed on the given interval, + - ```{"evolve_bool": true}``` - boolean values, + - ```{"evolve_choice": true, "values": [value_0, ..., value_n]}``` - values chosen uniformly from the given list. + +* Choose the main model in the pipe being evolved. Find or add an extra parameter that marks this model (for example, the existing `"main": true`). The dictionary (model) containing this parameter as a key will be trained (do not forget to pass this parameter's name to `key_main_model`). Change `save_path` and `load_path` of this model to ABSOLUTE paths (VERY IMPORTANT) to the folder where the population will be saved. + +* All the models in the pipe that contain the key `fit_on` will be fitted separately for each individual, saved to the same directory as the model, and called `fitted_model_{i}`. + +That's all you need to change in the config. Now let's move on to the example. + +## Example + +* If one prefers to run evolution on one of the datasets provided by DeepPavlov, +first download the embeddings and datasets. +To evolve parameters on the SNIPS dataset, download the data by running the following command with the corresponding config file name: +``` +cd deeppavlov +python deep.py download configs/intents/intents_snips.json +``` +* To evolve the model, run the following command with the corresponding config file name (see above): +``` +cd deeppavlov +python evolve.py configs/evolution/evolve_intents_snips.json +``` +* The folder `download/evolution/classification/intents_snips` will be created. Each population will be saved in a folder `download/evolution/classification/intents_snips/population_i`, each of which contains `population_size` folders `model_i` holding the saved model files, the saved files of the models from the pipe that have a "fit_on" key, `out.txt` and `err.txt` with the logs of the `deep.py train` run for each model, and `config.json` with the config of this individual. + +* Now one can open the Jupyter notebook `deeppavlov/models/evolution/Results_analysis.ipynb`, set `CONFIG_FILE` to the config file path and run the cells to see the evolution results.
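For illustration, a minimal sketch of how the `evolve_*` markers described above could be placed inside the main model's dictionary of a copied config (the surrounding `chainer`/`pipe` structure is abbreviated, the paths are placeholders, and the chosen parameters and ranges are only examples, not values from any shipped config):

```
{
  "chainer": {
    "pipe": [
      {
        "main": true,
        "name": "keras_classification_model",
        "save_path": "/absolute/path/to/evolution/model",
        "load_path": "/absolute/path/to/evolution/model",
        "lear_rate": {"evolve_range": [0.0001, 0.1], "scale": "log"},
        "text_size": {"evolve_range": [20, 100], "discrete": true},
        "confident_threshold": {"evolve_range": [0.3, 0.7]},
        "model_name": {"evolve_choice": true, "values": ["cnn_model", "bilstm_model"]}
      }
    ]
  }
}
```

Parameters left as plain values are not evolved; only items wrapped in `evolve_*` dictionaries take part in crossover and mutation.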
diff --git a/deeppavlov/models/evolution/Results_analysis.ipynb b/deeppavlov/models/evolution/Results_analysis.ipynb new file mode 100644 index 0000000000..cd5b839053 --- /dev/null +++ b/deeppavlov/models/evolution/Results_analysis.ipynb @@ -0,0 +1,348 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from pathlib import Path\n", + "import matplotlib.pyplot as plt\n", + "import copy\n", + "import json\n", + "%matplotlib inline\n", + "\n", + "from deeppavlov.core.commands.utils import set_deeppavlov_root, expand_path\n", + "from deeppavlov.models.evolution.evolution_param_generator import ParamsEvolution" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set here path to your config file, key main model and population size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "CONFIG_FILE = \"../../configs/evolution/evolve_intents_snips.json\"\n", + "KEY_MAIN_MODEL = \"main\"\n", + "POPULATION_SIZE = 2\n", + " \n", + "with open(CONFIG_FILE, \"r\") as f:\n", + " basic_params = json.load(f)\n", + "\n", + "set_deeppavlov_root(basic_params)\n", + "print(\"Considered basic config:\\n{}\".format(json.dumps(basic_params, indent=2)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evolution = ParamsEvolution(population_size=POPULATION_SIZE,\n", + " key_main_model=KEY_MAIN_MODEL,\n", + " **basic_params)\n", + "\n", + "validate_best = evolution.get_value_from_config(\n", + " evolution.basic_config, list(evolution.find_model_path(\n", + " evolution.basic_config, \"validate_best\"))[0] + [\"validate_best\"])\n", + "test_best = evolution.get_value_from_config(\n", + " evolution.basic_config, list(evolution.find_model_path(\n", + " evolution.basic_config, \"test_best\"))[0] + [\"test_best\"])\n", + "\n", + "TITLE = str(Path(evolution.get_value_from_config(\n", + " evolution.basic_config, evolution.main_model_path + [\"save_path\"])).stem)\n", + "print(\"Title name for the considered evolution is `{}`.\".format(TITLE))\n", + "\n", + "data = pd.read_csv(str(expand_path(Path(evolution.get_value_from_config(\n", + " evolution.basic_config, evolution.main_model_path + [\"save_path\"])).joinpath(\n", + " \"result_table.csv\"))), sep='\\t')\n", + "print(\"Number of populations: {}.\".format(int(data.shape[0] / POPULATION_SIZE)))\n", + "data.fillna(0., inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MEASURES = evolution.get_value_from_config(\n", + " evolution.basic_config, list(evolution.find_model_path(\n", + " evolution.basic_config, \"metrics\"))[0] + [\"metrics\"])\n", + "\n", + "for measure in MEASURES:\n", + " print(\"\\nMeasure: {}\".format(measure))\n", + " for data_type in [\"valid\", \"test\"]:\n", + " print(\"{}:\".format(data_type))\n", + " argmin = data[measure + \"_\" + data_type].argmin()\n", + " argmax = data[measure + \"_\" + data_type].argmax()\n", + " print(\"min for\\t{} model on\\t{} population\".format(argmin % POPULATION_SIZE,\n", + " argmin // POPULATION_SIZE))\n", + " print(\"max for\\t{} model on\\t{} population\".format(argmax % POPULATION_SIZE,\n", + " argmax // POPULATION_SIZE))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## If you want to plot measures depending on 
population colored by evolved measure value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "path_to_pics = expand_path(Path(evolution.get_value_from_config(\n", + " evolution.basic_config, evolution.main_model_path + [\"save_path\"])).joinpath(\"pics\"))\n", + "path_to_pics.mkdir(exist_ok=True, parents=True)\n", + "\n", + "if validate_best:\n", + " evolve_metric = MEASURES[0] + \"_valid\"\n", + "elif test_best:\n", + " evolve_metric = MEASURES[0] + \"_test\"\n", + " \n", + "cmap = plt.get_cmap('rainbow')\n", + "colors = [cmap(i) for i in np.linspace(0, 1, data.shape[0])]\n", + "color_ids = np.argsort(data.loc[:, evolve_metric].values)\n", + "\n", + "ylims = [(0., 1)] * len(MEASURES)\n", + "\n", + "for metric, ylim in zip(MEASURES, ylims):\n", + " plt.figure(figsize=(12,6))\n", + " if validate_best:\n", + " for i in range(data.shape[0]):\n", + " plt.scatter(i // POPULATION_SIZE, \n", + " data.loc[:, metric + \"_valid\"].values[i], \n", + " c=colors[np.where(color_ids == i)[0][0]], alpha=0.5, marker='o')\n", + " plt.plot(np.arange(data.shape[0]//POPULATION_SIZE), \n", + " data.loc[:, metric + \"_valid\"].max() * np.ones(data.shape[0]//POPULATION_SIZE), \n", + " c=colors[-1])\n", + " plt.plot(np.arange(data.shape[0]//POPULATION_SIZE), \n", + " data.loc[:, metric + \"_valid\"].min() * np.ones(data.shape[0]//POPULATION_SIZE), \n", + " c=colors[0])\n", + " if test_best:\n", + " for i in range(data.shape[0]):\n", + " plt.scatter(i // POPULATION_SIZE, \n", + " data.loc[:, metric + \"_test\"].values[i], \n", + " c=colors[np.where(color_ids == i)[0][0]], alpha=0.5, marker='+', s=200)\n", + " plt.plot(np.arange(data.shape[0]//POPULATION_SIZE), \n", + " data.loc[:, metric + \"_test\"].max() * np.ones(data.shape[0]//POPULATION_SIZE), \"--\",\n", + " c=colors[-1])\n", + " plt.plot(np.arange(data.shape[0]//POPULATION_SIZE), \n", + " data.loc[:, metric + \"_test\"].min() * np.ones(data.shape[0]//POPULATION_SIZE), \"--\",\n", + " c=colors[0])\n", + " \n", + "\n", + " plt.ylabel(metric, fontsize=20)\n", + " plt.xlabel(\"population\", fontsize=20)\n", + " plt.title(TITLE, fontsize=20)\n", + " plt.ylim(ylim[0], ylim[1])\n", + " plt.xticks(fontsize=20)\n", + " plt.yticks(fontsize=20)\n", + " plt.savefig(path_to_pics.joinpath(metric + \".png\"))\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "## If you want to plot measures depending on population colored by `evolution_model_id`\n", + "\n", + "#### That means model of the same `id` are of the same color." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "params_dictionaries = []\n", + "models_ids = []\n", + "\n", + "for i in range(data.shape[0]):\n", + " data.loc[i, \"params\"] = data.loc[i, \"params\"].replace(\"False\", \"false\")\n", + " data.loc[i, \"params\"] = data.loc[i, \"params\"].replace(\"True\", \"true\")\n", + " json_acceptable_string = data.loc[i, \"params\"].replace(\"'\", \"\\\"\")\n", + " d = json.loads(json_acceptable_string)\n", + " params_dictionaries.append(d)\n", + " models_ids.append(d[\"evolution_model_id\"])\n", + "\n", + "models_ids = np.array(models_ids)\n", + "models_ids" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "cmap = plt.get_cmap('rainbow')\n", + "colors = [cmap(i) for i in np.linspace(0, 1, len(np.unique(models_ids)))]\n", + "\n", + "ylims = [(0., 1)] * len(MEASURES)\n", + "\n", + "for metric, ylim in zip(MEASURES, ylims):\n", + " plt.figure(figsize=(12,6))\n", + " if validate_best:\n", + " for i in range(data.shape[0]):\n", + " plt.scatter(i // POPULATION_SIZE, \n", + " data.loc[:, metric + \"_valid\"].values[i], \n", + "# c=colors[models_ids[i]], alpha=0.5, marker='o')\n", + " c=colors[np.where(models_ids[i] == np.unique(models_ids))[0][0]], alpha=0.5, marker='o')\n", + " \n", + " plt.plot(np.arange(data.shape[0]//POPULATION_SIZE), \n", + " data.loc[:, metric + \"_valid\"].max() * np.ones(data.shape[0]//POPULATION_SIZE), \n", + " c=colors[-1])\n", + " plt.plot(np.arange(data.shape[0]//POPULATION_SIZE), \n", + " data.loc[:, metric + \"_valid\"].min() * np.ones(data.shape[0]//POPULATION_SIZE), \n", + " c=colors[0])\n", + " if test_best:\n", + " for i in range(data.shape[0]):\n", + " plt.scatter(i // POPULATION_SIZE, \n", + " data.loc[:, metric + \"_test\"].values[i], \n", + " c=colors[np.where(models_ids[i] == np.unique(models_ids))[0][0]], alpha=0.5, marker='+', s=200)\n", + " plt.plot(np.arange(data.shape[0]//POPULATION_SIZE), \n", + " data.loc[:, metric + \"_test\"].max() * np.ones(data.shape[0]//POPULATION_SIZE), \"--\",\n", + " c=colors[-1])\n", + " plt.plot(np.arange(data.shape[0]//POPULATION_SIZE), \n", + " data.loc[:, metric + \"_test\"].min() * np.ones(data.shape[0]//POPULATION_SIZE), \"--\",\n", + " c=colors[0])\n", + " \n", + "\n", + " plt.ylabel(metric, fontsize=20)\n", + " plt.xlabel(\"population\", fontsize=20)\n", + " plt.title(TITLE, fontsize=20)\n", + " plt.ylim(ylim[0], ylim[1])\n", + " plt.xticks(fontsize=20)\n", + " plt.yticks(fontsize=20)\n", + " plt.savefig(path_to_pics.joinpath(metric + \"_colored_ids.png\"))\n", + " plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "cmap = plt.get_cmap('rainbow')\n", + "colors = [cmap(i) for i in np.linspace(0, 1, data.shape[0])]\n", + "color_ids = np.argsort(data.loc[:, evolve_metric].values)\n", + "\n", + "for param_path in evolution.paths_to_evolving_params:\n", + " param_name = param_path[-1]\n", + " print(param_path, param_name)\n", + " \n", + " plt.figure(figsize=(12,12))\n", + " for i in range(data.shape[0]):\n", + " param_dict = evolution.get_value_from_config(evolution.basic_config, param_path)\n", + " if param_dict.get(\"evolve_range\") and param_dict.get(\"discrete\"):\n", + " plt.scatter(i // POPULATION_SIZE, \n", + " evolution.get_value_from_config(params_dictionaries[i], param_path),\n", + "# + (np.random.random() - 0.5) / 2,\n", + 
" c=colors[np.where(color_ids == i)[0][0]], alpha=0.5)\n", + " elif param_dict.get(\"evolve_range\"):\n", + " plt.scatter(i // POPULATION_SIZE, \n", + " evolution.get_value_from_config(params_dictionaries[i], param_path),\n", + " c=colors[np.where(color_ids == i)[0][0]], alpha=0.5)\n", + " elif param_dict.get(\"evolve_choice\"):\n", + " values = np.array(param_dict.get(\"values\"))\n", + " plt.scatter(i // POPULATION_SIZE, \n", + " np.where(values == evolution.get_value_from_config(\n", + " params_dictionaries[i], param_path))[0][0],\n", + " c=colors[np.where(color_ids == i)[0][0]], alpha=0.5)\n", + " plt.yticks(np.arange(len(values)), values, fontsize=20)\n", + " elif param_dict.get(\"evolve_bool\"):\n", + " values = np.array([False, True])\n", + " plt.scatter(i // POPULATION_SIZE, \n", + " np.where(values == evolution.get_value_from_config(\n", + " params_dictionaries[i], param_path))[0][0],\n", + " c=colors[np.where(color_ids == i)[0][0]], alpha=0.5)\n", + " plt.yticks(np.arange(len(values)), [\"False\", \"True\"], fontsize=20)\n", + "\n", + " plt.ylabel(param_name, fontsize=20)\n", + " plt.xlabel(\"population\", fontsize=20)\n", + " plt.title(TITLE, fontsize=20)\n", + " plt.xticks(fontsize=20)\n", + " plt.yticks(fontsize=20)\n", + " plt.savefig(path_to_pics.joinpath(param_name + \".png\"))\n", + " plt.show()\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python-deep36", + "language": "python", + "name": "deep36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/deeppavlov/models/evolution/evolution_param_generator.py b/deeppavlov/models/evolution/evolution_param_generator.py new file mode 100644 index 0000000000..1ad3894860 --- /dev/null +++ b/deeppavlov/models/evolution/evolution_param_generator.py @@ -0,0 +1,589 @@ +import numpy as np +from copy import deepcopy +from pathlib import Path +import json +import random + +from deeppavlov.core.common.registry import register +from deeppavlov.core.common.file import read_json +from deeppavlov.core.common.log import get_logger + + +log = get_logger(__name__) + + +@register('params_evolution') +class ParamsEvolution: + """ + Class performs full evolutionary process (task scores -> max): + 1. initializes random population + 2. makes replacement to get next generation: + a. selection according to obtained scores + b. crossover (recombination) with given probability p_crossover + c. 
mutation with given mutation rate p_mutation (probability to mutate) + according to given mutation power sigma + (current mutation power is randomly from -sigma to sigma) + """ + + def __init__(self, + population_size, + p_crossover=0.5, crossover_power=0.5, + p_mutation=0.5, mutation_power=0.1, + key_main_model="main", + seed=None, + train_partition=1, + elitism_with_weights=False, + **kwargs): + """ + Initialize evolution with random population + Args: + population_size: number of individuums per generation + p_crossover: probability to cross over for current replacement + crossover_power: part of EVOLVING parents parameters to exchange for offsprings + p_mutation: probability of mutation for current replacement + mutation_power: allowed percentage of mutation + key_model_to_evolve: binary flag that should be inserted into the dictionary + with main model in the basic config (to determine save and load paths that will be changed) + seed: random seed for initialization + train_partition: integer number of train data parts + elitism_with_weights: whether to save elite models with weigths or without + **kwargs: basic config with parameters + """ + + self.basic_config = deepcopy(kwargs) + self.main_model_path = list(self.find_model_path(self.basic_config, key_main_model))[0] + log.info("Main model path in config: {}".format(self.main_model_path)) + + self.population_size = population_size + self.p_crossover = p_crossover + self.p_mutation = p_mutation + self.mutation_power = mutation_power + self.crossover_power = crossover_power + self.elitism_with_weights = elitism_with_weights + + self.n_saved_best_pretrained = 0 + self.train_partition = train_partition + + self.paths_to_evolving_params = [] + for evolve_type in ["evolve_range", "evolve_choice", "evolve_bool"]: + for path_ in self.find_model_path(self.basic_config, evolve_type): + self.paths_to_evolving_params.append(path_) + + self.n_evolving_params = len(self.paths_to_evolving_params) + self.evolution_model_id = 0 + self.eps = 1e-6 + + self.paths_to_fiton_dicts = [] + for path_ in self.find_model_path(self.basic_config, "fit_on"): + self.paths_to_fiton_dicts.append(path_) + self.n_fiton_dicts = len(self.paths_to_fiton_dicts) + + try: + self.evolve_metric_optimization = self.get_value_from_config( + self.basic_config, list(self.find_model_path( + self.basic_config, "metric_optimization"))[0] + ["metric_optimization"]) + except: + self.evolve_metric_optimization = "maximize" + + if seed is None: + pass + else: + np.random.seed(seed) + random.seed(seed) + + def find_model_path(self, config, key_model, path=[]): + """ + Find path to dictionary in config that contains key 'key_model' + Args: + config: dictionary + key_model: key of sub-dictionary to be found + path: list of keys and/or integers (for list) with relative path (needed for recursion) + + Returns: + path in config -- list of keys (strings and integers) + """ + config_pointer = config + if type(config_pointer) is dict and key_model in config_pointer.keys(): + # main model is an element of chainer.pipe list + # main model is a dictionary and has key key_main_model + yield path + else: + if type(config_pointer) is dict: + for key in list(config_pointer.keys()): + for path_ in self.find_model_path(config_pointer[key], key_model, path + [key]): + yield path_ + elif type(config_pointer) is list: + for i in range(len(config_pointer)): + for path_ in self.find_model_path(config_pointer[i], key_model, path + [i]): + yield path_ + + @staticmethod + def 
insert_value_or_dict_into_config(config, path, value): + """ + Insert value to dictionary determined by path[:-1] in field with key path[-1] + Args: + config: dictionary + path: list of keys and/or integers (for list) + value: value to be inserted + + Returns: + config with inserted value + """ + config_copy = deepcopy(config) + config_pointer = config_copy + for el in path[:-1]: + if type(config_pointer) is dict: + config_pointer = config_pointer.setdefault(el, {}) + elif type(config_pointer) is list: + config_pointer = config_pointer[el] + else: + pass + config_pointer[path[-1]] = value + return config_copy + + @staticmethod + def get_value_from_config(config, path): + """ + Return value of config element determined by path + Args: + config: dictionary + path: list of keys and/or integers (for list) + + Returns: + value + """ + config_copy = deepcopy(config) + config_pointer = config_copy + for el in path[:-1]: + if type(config_pointer) is dict: + config_pointer = config_pointer.setdefault(el, {}) + elif type(config_pointer) is list: + config_pointer = config_pointer[el] + else: + pass + return config_pointer[path[-1]] + + def initialize_params_in_config(self, basic_config, paths): + """ + Randomly initialize all the changable parameters in config + Args: + basic_config: config where changable parameters are dictionaries with keys + `evolve_range`, `evolve_bool`, `evolve_choice` + paths: paths to changable parameters + + Returns: + config + """ + config = deepcopy(basic_config) + for path_ in paths: + param_name = path_[-1] + value = self.get_value_from_config(basic_config, path_) + if type(value) is dict: + if value.get("evolve_choice") or value.get("evolve_range") or value.get("evolve_bool"): + config = self.insert_value_or_dict_into_config(config, + path_, + self.sample_params( + **{param_name: + deepcopy(value)})[param_name]) + + return config + + def first_generation(self, iteration=0): + """ + Initialize first generation randomly according to the given constraints is self.params + Args: + iteration: number of iteration + + Returns: + first generation that consists of self.population_size individuums + """ + population = [] + for i in range(self.population_size): + population.append(self.initialize_params_in_config(self.basic_config, self.paths_to_evolving_params)) + for which_path in ["save_path", "load_path"]: + population[-1] = self.insert_value_or_dict_into_config( + population[-1], self.main_model_path + [which_path], + str(Path(self.get_value_from_config(self.basic_config, self.main_model_path + [which_path]) + ).joinpath("population_" + str(iteration)).joinpath("model_" + str(i)).joinpath("model"))) + for path_id, path_ in enumerate(self.paths_to_fiton_dicts): + suffix = Path(self.get_value_from_config(self.basic_config, + path_ + ["save_path"])).suffix + for which_path in ["save_path", "load_path"]: + population[-1] = self.insert_value_or_dict_into_config( + population[-1], path_ + [which_path], + str(Path(self.get_value_from_config(self.basic_config, self.main_model_path + [which_path]) + ).joinpath("population_" + str(iteration)).joinpath("model_" + str(i)).joinpath( + "fitted_model_" + str(path_id)).with_suffix(suffix))) + population[-1]["evolution_model_id"] = self.evolution_model_id + self.evolution_model_id += 1 + + return population + + def next_generation(self, generation, scores, iteration): + """ + Provide replacement + Args: + generation: current generation (set of self.population_size configs + scores: corresponding scores that should be maximized + iteration: 
iteration number + + Returns: + the next generation according to the given scores of current generation + """ + + next_population = self.selection_of_best_with_weights(generation, scores) + log.info("Saved with weights: {} models".format(self.n_saved_best_pretrained)) + offsprings = self.crossover(generation, scores) + + changable_next = self.mutation(offsprings) + + next_population.extend(changable_next) + + for i in range(self.n_saved_best_pretrained): + # if several train files: + if self.train_partition != 1: + file_ext = str(Path(next_population[i]["dataset_reader"]["train"]).suffix) + next_population[i]["dataset_reader"]["train"] = "_".join( + [str(p) for p in Path(next_population[i]["dataset_reader"]["train"]).stem.split("_")[:-1]])\ + + "_" + str(iteration % self.train_partition) + file_ext + try: + # re-init learning rate with the final one (works for KerasModel) + next_population[i] = self.insert_value_or_dict_into_config( + next_population[i], + self.main_model_path + ["lear_rate"], + read_json(str(Path(self.get_value_from_config(next_population[i], + self.main_model_path + ["save_path"]) + ).parent.joinpath("model_opt.json")))["final_lear_rate"]) + except: + pass + + # load_paths + if self.elitism_with_weights: + # if elite models are saved with weights + next_population[i] = self.insert_value_or_dict_into_config( + next_population[i], + self.main_model_path + ["load_path"], + str(Path(self.get_value_from_config(next_population[i], + self.main_model_path + ["save_path"])))) + for path_id, path_ in enumerate(self.paths_to_fiton_dicts): + next_population[i] = self.insert_value_or_dict_into_config( + next_population[i], path_ + ["load_path"], + str(Path(self.get_value_from_config(next_population[i], + path_ + ["save_path"])))) + else: + # if elite models are saved only as configurations and trained again + next_population[i] = self.insert_value_or_dict_into_config( + next_population[i], + self.main_model_path + ["load_path"], + str(Path(self.get_value_from_config(self.basic_config, self.main_model_path + ["load_path"]) + ).joinpath("population_" + str(iteration)).joinpath("model_" + str(i)).joinpath("model"))) + for path_id, path_ in enumerate(self.paths_to_fiton_dicts): + suffix = Path(self.get_value_from_config(self.basic_config, + path_ + ["load_path"])).suffix + next_population[i] = self.insert_value_or_dict_into_config( + next_population[i], path_ + ["load_path"], + str(Path(self.get_value_from_config(self.basic_config, self.main_model_path + ["load_path"]) + ).joinpath("population_" + str(iteration)).joinpath("model_" + str(i)).joinpath( + "fitted_model_" + str(path_id)).with_suffix(suffix))) + + # save_paths + next_population[i] = self.insert_value_or_dict_into_config( + next_population[i], + self.main_model_path + ["save_path"], + str(Path(self.get_value_from_config(self.basic_config, self.main_model_path + ["save_path"]) + ).joinpath("population_" + str(iteration)).joinpath("model_" + str(i)).joinpath("model"))) + for path_id, path_ in enumerate(self.paths_to_fiton_dicts): + suffix = Path(self.get_value_from_config(self.basic_config, + path_ + ["save_path"])).suffix + next_population[i] = self.insert_value_or_dict_into_config( + next_population[i], path_ + ["save_path"], + str(Path(self.get_value_from_config(self.basic_config, self.main_model_path + ["save_path"]) + ).joinpath("population_" + str(iteration)).joinpath("model_" + str(i)).joinpath( + "fitted_model_" + str(path_id)).with_suffix(suffix))) + + for i in range(self.n_saved_best_pretrained, self.population_size): 
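+            # The remaining slots (after the elite individuums handled above) are filled with mutated offsprings:
+            # each one gets fresh save/load paths under population_<iteration>/model_<i> and a new evolution_model_id.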
+ # if several train files + if self.train_partition != 1: + file_ext = str(Path(next_population[i]["dataset_reader"]["train"]).suffix) + next_population[i]["dataset_reader"]["train"] = "_".join( + [str(p) for p in Path(next_population[i]["dataset_reader"]["train"]).stem.split("_")[:-1]])\ + + "_" + str(iteration % self.train_partition) + file_ext + for which_path in ["save_path", "load_path"]: + next_population[i] = self.insert_value_or_dict_into_config( + next_population[i], + self.main_model_path + [which_path], + str(Path(self.get_value_from_config(self.basic_config, self.main_model_path + [which_path]) + ).joinpath("population_" + str(iteration)).joinpath("model_" + str(i)).joinpath("model"))) + for path_id, path_ in enumerate(self.paths_to_fiton_dicts): + suffix = Path(self.get_value_from_config(self.basic_config, + path_ + ["save_path"])).suffix + for which_path in ["save_path", "load_path"]: + next_population[i] = self.insert_value_or_dict_into_config( + next_population[i], path_ + [which_path], + str(Path(self.get_value_from_config(self.basic_config, self.main_model_path + [which_path]) + ).joinpath("population_" + str(iteration)).joinpath("model_" + str(i)).joinpath( + "fitted_model_" + str(path_id)).with_suffix(suffix))) + + next_population[i]["evolution_model_id"] = self.evolution_model_id + self.evolution_model_id += 1 + + return next_population + + def selection_of_best_with_weights(self, population, scores): + """ + Select individuums to save with weights for the next generation from given population. + Range is an order of an individuum within sorted scores (1 range = max-score, self.population_size = min-score) + Individuum with the best score has probability equal to 1 (100%). + Individuum with the worst score has probability equal to 0 (0%). + Probability of i-th individuum to be selected with weights is (a * range_i + b) + where a = 1. / (1. - self.population_size), and + b = self.population_size / (self.population_size - 1.) + Args: + population: self.population_size individuums + scores: list of corresponding scores + + Returns: + selected self.n_saved_best_pretrained (changable) individuums + """ + ranges = self.range_scores(scores) + a = 1. / (1. - self.population_size) + b = self.population_size / (self.population_size - 1.) + probas_to_be_selected = a * ranges + b + + selected = [] + for i in range(self.population_size): + if self.decision(probas_to_be_selected[i]): + selected.append(deepcopy(population[i])) + + self.n_saved_best_pretrained = len(selected) + return selected + + def range_scores(self, scores): + """ + Ranges scores, + range 1 corresponds to the best score, + range self.population_size corresponds to the worst score. 
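+        Scores equal to None are replaced with the worst observed score (shifted by eps) before ranging.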
+ Args: + scores: list of corresponding scores of population + + Returns: + ranges + """ + not_none_scores = np.array([x for x in scores if x is not None]) + if len(not_none_scores) == 0: + not_none_scores = np.array([0]) + min_score = np.min(not_none_scores) + max_score = np.max(not_none_scores) + for i in range(self.population_size): + if scores[i] is None: + if self.evolve_metric_optimization == "maximize": + scores[i] = min_score - self.eps + else: + scores[i] = max_score + self.eps + scores = np.array(scores, dtype='float') + + sorted_ids = np.argsort(scores) + if self.evolve_metric_optimization == "minimize": + sorted_ids = sorted_ids[::-1] + ranges = np.array([self.population_size - np.where(i == sorted_ids)[0][0] + for i in np.arange(self.population_size)]) + return ranges + + def crossover(self, population, scores): + """ + Recombine randomly population in pairs and cross over them with given probability. + Cross over from two parents produces two offsprings + each of which contains crossover_power portion of the parameter values from one parent, + and the other (1 - crossover_power portion) from the other parent + Args: + population: self.population_size individuums + scores: list of corresponding scores + + Returns: + (self.population_size - self.n_saved_best_pretained) offsprings + """ + offsprings = [] + + ranges = self.range_scores(scores) + a = 1. / (1. - self.population_size) + b = self.population_size / (self.population_size - 1.) + probas_to_be_parent = (a * ranges + b) / np.sum(a * ranges + b) + intervals = np.array([np.sum(probas_to_be_parent[:i]) for i in range(self.population_size)]) + + for i in range(self.population_size - self.n_saved_best_pretrained): + rs = np.random.random(2) + parents = population[np.where(rs[0] > intervals)[0][-1]], population[np.where(rs[1] > intervals)[0][-1]] + + if self.decision(self.p_crossover): + params_perm = np.random.permutation(self.n_evolving_params) + + curr_offsprings = [deepcopy(parents[0]), + deepcopy(parents[1])] + + part = int(self.crossover_power * self.n_evolving_params) + + for j in range(self.n_evolving_params - part, self.n_evolving_params): + curr_offsprings[0] = self.insert_value_or_dict_into_config(curr_offsprings[0], + self.paths_to_evolving_params[ + params_perm[j]], + self.get_value_from_config( + parents[1], + self.paths_to_evolving_params[ + params_perm[j]])) + + curr_offsprings[1] = self.insert_value_or_dict_into_config(curr_offsprings[1], + self.paths_to_evolving_params[ + params_perm[j]], + self.get_value_from_config( + parents[0], + self.paths_to_evolving_params[ + params_perm[j]])) + offsprings.append(deepcopy(curr_offsprings[0])) + else: + offsprings.append(deepcopy(parents[0])) + + return offsprings + + def mutation(self, population): + """ + Mutate each parameter of each individuum in population + Args: + population: self.population_size individuums + + Returns: + mutated population + """ + mutated = [] + + for individuum in population: + mutated_individuum = deepcopy(individuum) + for path_ in self.paths_to_evolving_params: + param_value = self.get_value_from_config(individuum, path_) + mutated_individuum = self.insert_value_or_dict_into_config( + mutated_individuum, path_, + self.mutation_of_param(path_, param_value)) + mutated.append(mutated_individuum) + + return mutated + + def mutation_of_param(self, param_path, param_value): + """ + Mutate particular parameter separately + Args: + param_path: path to parameter in basic config + param_value: current parameter valuer + + Returns: + mutated parameter 
value + """ + if self.decision(self.p_mutation): + param_name = param_path[-1] + basic_value = self.get_value_from_config(self.basic_config, param_path) + if type(basic_value) is dict: + if basic_value.get('discrete', False): + val = round(param_value + + ((2 * np.random.random() - 1.) * self.mutation_power + * self.sample_params(**{param_name: basic_value})[param_name])) + val = min(max(basic_value["evolve_range"][0], val), + basic_value["evolve_range"][1]) + new_mutated_value = val + elif 'evolve_range' in basic_value.keys(): + val = param_value + \ + ((2 * np.random.random() - 1.) * self.mutation_power + * self.sample_params(**{param_name: basic_value})[param_name]) + val = min(max(basic_value["evolve_range"][0], val), + basic_value["evolve_range"][1]) + new_mutated_value = val + elif basic_value.get("evolve_choice"): + new_mutated_value = self.sample_params(**{param_name: basic_value})[param_name] + elif basic_value.get("evolve_bool"): + new_mutated_value = self.sample_params(**{param_name: basic_value})[param_name] + else: + new_mutated_value = param_value + else: + new_mutated_value = param_value + else: + new_mutated_value = param_value + + return new_mutated_value + + def decision(self, probability): + """ + Make decision whether to do action or not with given probability + Args: + probability: probability whether to do action or not + + Returns: + bool decision + """ + r = np.random.random() + if r < probability: + return True + else: + return False + + def sample_params(self, **params): + """ + Sample parameters according to the given possible values + Args: + **params: dictionary {"param_0": {"evolve_range": [0, 10]}, + "param_1": {"evolve_range": [0, 10], "discrete": true}, + "param_2": {"evolve_range": [0, 1], "scale": "log"}, + "param_3": {"evolve_bool": true}, + "param_4": [0, 1, 2, 3]} + + Returns: + random parameter value + """ + if not params: + return {} + else: + params_copy = deepcopy(params) + params_sample = dict() + for param, param_val in params_copy.items(): + if isinstance(param_val, dict): + if 'evolve_bool' in param_val and param_val['evolve_bool']: + sample = bool(random.choice([True, False])) + elif 'evolve_range' in param_val: + sample = self._sample_from_ranges(param_val) + elif 'evolve_choice' in param_val: + sample = random.choice(param_val['values']) + params_sample[param] = sample + else: + params_sample[param] = params_copy[param] + return params_sample + + def _sample_from_ranges(self, opts): + """ + Sample parameters from ranges + Args: + opts: dictionary {"param_0": {"evolve_range": [0, 10]}, + "param_1": {"evolve_range": [0, 10], "discrete": true}, + "param_2": {"evolve_range": [0, 1], "scale": "log"}} + + Returns: + random parameter value from range + """ + from_ = opts['evolve_range'][0] + to_ = opts['evolve_range'][1] + if opts.get('scale', None) == 'log': + sample = self._sample_log(from_, to_) + else: + sample = np.random.uniform(from_, to_) + if opts.get('discrete', False): + sample = int(np.round(sample)) + return sample + + @staticmethod + def _sample_log(from_, to_): + """ + Sample parameters from ranges with log scale + Args: + from_: lower boundary of values + to_: upper boundary of values + + Returns: + random parameters value from range with log scale + """ + sample = np.exp(np.random.uniform(np.log(from_), np.log(to_))) + return float(sample) diff --git a/deeppavlov/skills/go_bot/README.md b/deeppavlov/models/go_bot/README.md similarity index 98% rename from deeppavlov/skills/go_bot/README.md rename to 
deeppavlov/models/go_bot/README.md index bf782aa36b..74ba3d7909 100644 --- a/deeppavlov/skills/go_bot/README.md +++ b/deeppavlov/models/go_bot/README.md @@ -13,16 +13,16 @@ Here is a simple example of interaction with a trained dialogue bot (can be down ```x:: >> Hello, welcome to the Cambridge restaurant system. You can ask for restaurants by area, price range or food type. How may I help you? x::hi, i want cheap food -2018-05-16 12:28:48.459 INFO in 'deeppavlov.skills.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'cheap'}, got 22 results. +2018-05-16 12:28:48.459 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'cheap'}, got 22 results. >> The lucky star serves chinese food in the cheap price range. x::is there anything with french food? -2018-05-16 12:29:02.795 INFO in 'deeppavlov.skills.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'cheap', 'food': 'french'}, got 0 results. +2018-05-16 12:29:02.795 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'cheap', 'food': 'french'}, got 0 results. >> Sorry there is no french restaurant in the cheap price range. x::ok, maybe there is something in expensive pricerange? -2018-05-16 12:29:23.337 INFO in 'deeppavlov.skills.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'expensive', 'food': 'french'}, got 2 results. +2018-05-16 12:29:23.337 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'expensive', 'food': 'french'}, got 2 results. >> Cote serves french food in the expensive price range. x::in what part of town is it? -2018-05-16 12:29:31.193 INFO in 'deeppavlov.skills.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'expensive', 'food': 'french'}, got 2 results. +2018-05-16 12:29:31.193 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'expensive', 'food': 'french'}, got 2 results. >> Restaurant two two is a nice place in the north of town serving tasty french food. x::ok, give me their phone number >> The phone number of restaurant two two is 01223 351880. @@ -31,10 +31,10 @@ x::address and post code too x::ok, thank you! >> Restaurant two two serves french food in the expensive price range. x::and if i wanted an italian restuarant? -2018-05-16 12:30:15.98 INFO in 'deeppavlov.skills.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'expensive', 'food': 'italian'}, got 4 results. +2018-05-16 12:30:15.98 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'expensive', 'food': 'italian'}, got 4 results. >> Frankie and bennys serves italian food in the expensive price range. x::more cheaply priced? -2018-05-16 12:30:36.560 INFO in 'deeppavlov.skills.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'cheap', 'food': 'italian'}, got 5 results. +2018-05-16 12:30:36.560 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 346: Made api_call with {'pricerange': 'cheap', 'food': 'italian'}, got 5 results. >> Da vinci pizzeria serves italian food in the cheap price range. x::ok, give me their phone number and address, please >> The phone number of da vinci pizzeria is 01223 351707 and it is on 20 milton road chesterton. 
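Since this change set renames `deeppavlov/skills/go_bot/*` to `deeppavlov/models/go_bot/*`, downstream code that imports the old package path will break. A minimal sketch of the required import update (the `GoalOrientedBotNetwork` name is taken from the renamed `bot.py` below; no other API changes are assumed):

```python
# Before this change set the network lived under deeppavlov.skills:
# from deeppavlov.skills.go_bot.network import GoalOrientedBotNetwork

# After the rename the same class is importable from deeppavlov.models:
from deeppavlov.models.go_bot.network import GoalOrientedBotNetwork
```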
diff --git a/deeppavlov/models/classifiers/intents/__init__.py b/deeppavlov/models/go_bot/__init__.py similarity index 100% rename from deeppavlov/models/classifiers/intents/__init__.py rename to deeppavlov/models/go_bot/__init__.py diff --git a/deeppavlov/skills/go_bot/bot.py b/deeppavlov/models/go_bot/bot.py similarity index 98% rename from deeppavlov/skills/go_bot/bot.py rename to deeppavlov/models/go_bot/bot.py index 5313ac5de5..8338a7c80b 100644 --- a/deeppavlov/skills/go_bot/bot.py +++ b/deeppavlov/models/go_bot/bot.py @@ -17,15 +17,13 @@ import re import numpy as np -from typing import Type from deeppavlov.core.commands.utils import expand_path from deeppavlov.core.common.registry import register from deeppavlov.core.models.nn_model import NNModel -from deeppavlov.core.common.errors import ConfigError -from deeppavlov.skills.go_bot.network import GoalOrientedBotNetwork -import deeppavlov.skills.go_bot.templates as templ from deeppavlov.core.common.log import get_logger +from deeppavlov.models.go_bot.network import GoalOrientedBotNetwork +import deeppavlov.models.go_bot.templates as templ log = get_logger(__name__) diff --git a/deeppavlov/skills/go_bot/diagram.png b/deeppavlov/models/go_bot/diagram.png similarity index 100% rename from deeppavlov/skills/go_bot/diagram.png rename to deeppavlov/models/go_bot/diagram.png diff --git a/deeppavlov/skills/go_bot/metrics.py b/deeppavlov/models/go_bot/metrics.py similarity index 100% rename from deeppavlov/skills/go_bot/metrics.py rename to deeppavlov/models/go_bot/metrics.py diff --git a/deeppavlov/skills/go_bot/network.py b/deeppavlov/models/go_bot/network.py similarity index 100% rename from deeppavlov/skills/go_bot/network.py rename to deeppavlov/models/go_bot/network.py diff --git a/deeppavlov/skills/go_bot/templates.py b/deeppavlov/models/go_bot/templates.py similarity index 100% rename from deeppavlov/skills/go_bot/templates.py rename to deeppavlov/models/go_bot/templates.py diff --git a/deeppavlov/skills/go_bot/tracker.py b/deeppavlov/models/go_bot/tracker.py similarity index 100% rename from deeppavlov/skills/go_bot/tracker.py rename to deeppavlov/models/go_bot/tracker.py diff --git a/deeppavlov/models/morpho_tagger/common.py b/deeppavlov/models/morpho_tagger/common.py index 4a1fdb331c..ce4dac6872 100644 --- a/deeppavlov/models/morpho_tagger/common.py +++ b/deeppavlov/models/morpho_tagger/common.py @@ -5,7 +5,7 @@ from deeppavlov.core.commands.utils import set_deeppavlov_root, expand_path from deeppavlov.core.common.file import read_json from deeppavlov.core.common.params import from_params -from deeppavlov.core.common.registry import model as get_model +from deeppavlov.core.common.registry import get_model from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component @@ -13,7 +13,6 @@ from deeppavlov.models.morpho_tagger.common_tagger import make_pos_and_tag - def predict_with_model(config_path): config = read_json(config_path) set_deeppavlov_root(config) diff --git a/deeppavlov/models/ranking/README_TFIDF.md b/deeppavlov/models/ranking/README_TFIDF.md new file mode 100644 index 0000000000..6686f6c35c --- /dev/null +++ b/deeppavlov/models/ranking/README_TFIDF.md @@ -0,0 +1,112 @@ +# TFIDF ranker + +This is an implementation of a document ranker based on tfidf vectorization. The ranker implementation +is based on [DrQA](https://github.com/facebookresearch/DrQA) project. 
+
+
+### Config
+
+Default ranker config for **English** language is `deeppavlov/configs/ranking/en_ranker_tfidf_wiki.json`
+
+Default ranker config for **Russian** language is `deeppavlov/configs/ranking/ru_ranker_tfidf_wiki.json`
+
+The ranker config for **English** language can be found at `deeppavlov/configs/odqa/en_ranker_prod.json`
+
+The ranker config for **Russian** language can be found at `deeppavlov/configs/odqa/ru_ranker_prod.json`
+
+* **dataset_iterator** - downloads the Wikipedia DB and creates batches for ranker fitting
+    * **_data_dir_** - a directory to download the DB to
+    * **_data_url_** - a URL to download the Wikipedia DB from
+    * **_shuffle_** - whether to perform shuffling when iterating over the DB or not
+* **chainer** - pipeline manager
+    * **_in_** - pipeline input data (questions)
+    * **_out_** - pipeline output data (Wikipedia article ids and scores of the articles)
+* **tfidf_ranker** - the ranker class
+    * **top_n** - the number of documents to return (when top_n=1 the most relevant document is returned)
+    * **_in_** - ranker input data (questions)
+    * **_out_** - ranker output data (Wikipedia article ids)
+    * **_fit_on_batch_** - fit the ranker on batches of Wikipedia articles
+    * **_vectorizer_** - a vectorizer class
+        * **_fit_on_batch_** - fit the vectorizer on batches of Wikipedia articles
+        * **_save_path_** - a path to serialize the vectorizer to
+        * **_load_path_** - a path to load the vectorizer from
+        * **_tokenizer_** - a tokenizer class
+            * **_lemmas_** - whether to lemmatize tokens or not
+            * **_ngram_range_** - ngram range for vectorizer features
+* **train** - parameters for vectorizer fitting
+    * **_validate_best_** - is ignored, any value
+    * **_test_best_** - is ignored, any value
+    * **_batch_size_** - how many Wikipedia articles the dataset iterator should return in a single batch
+
+## Running the ranker
+
+**Training and inferring the ranker on English Wikipedia requires 16 GB of RAM**
+
+## Training
+
+Run the following to fit the ranker on **English** Wikipedia:
+```bash
+cd deeppavlov/
+python deep.py train deeppavlov/configs/ranking/en_ranker_tfidf_wiki.json
+```
+Run the following to fit the ranker on **Russian** Wikipedia:
+```bash
+cd deeppavlov/
+python deep.py train deeppavlov/configs/ranking/ru_ranker_tfidf_wiki.json
+```
+
+## Interacting
+
+When interacting, the ranker returns titles of the relevant documents.
+
+Run the following to interact with the **English** ranker:
+```bash
+cd deeppavlov/
+python deep.py interact deeppavlov/configs/ranking/en_ranker_tfidf_wiki.json -d
+```
+Run the following to interact with the **Russian** ranker:
+```bash
+cd deeppavlov/
+python deep.py interact deeppavlov/configs/ranking/ru_ranker_tfidf_wiki.json -d
+```
+
+## Pretrained models
+
+Wikipedia DB and pretrained tfidf matrices are downloaded to `deeppavlov/download/odqa` by default.
+
+### enwiki.db
+
+**enwiki.db** SQLite database consists of **5159530 Wikipedia articles**
+and is built by the following steps:
+1. Download a Wikipedia dump file. We took the latest [enwiki](https://dumps.wikimedia.org/enwiki/20180201)
+ (from 2018-02-11)
+2. Unpack and extract the articles with [WikiExtractor](https://github.com/attardi/wikiextractor)
+ (with `--json`, `--no-templates`, `--filter_disambig_pages` options)
+3. Build a database with the help of [DrQA script](https://github.com/facebookresearch/DrQA/blob/master/scripts/retriever/build_db.py).
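+
+A quick sanity check of the database from Python (the `documents` table layout is assumed to follow the DrQA `build_db.py` script referenced above):
+```python
+import sqlite3
+
+# Count the stored articles; the README above states there should be 5159530 of them.
+conn = sqlite3.connect('enwiki.db')
+print(conn.execute('SELECT COUNT(*) FROM documents').fetchone()[0])
+conn.close()
+```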
+ +### enwiki_tfidf_matrix.npz + + **enwiki_tfidf_matrix.npz** is a full Wikipedia tf-idf matrix of size `hash_size x number of documents` which is + `2**24 x 5159530`. This matrix is built with `deeppavlov/models/vectorizers/hashing_tfidf_vectorizer.HashingTfidfVectorizer` + class. + +### ruwiki.db + +**ruwiki.db** SQLite database consists of **1463888 Wikipedia articles** +and is built by the following steps: +1. Download a Wikipedia dump file. We took the latest [ruwiki](https://dumps.wikimedia.org/ruwiki/20180401) +(from 2018-04-01) +2. Unpack and extract the articles with [WikiExtractor](https://github.com/attardi/wikiextractor) +(with `--json`, `--no-templates`, `--filter_disambig_pages` options) +3. Build a database with the help of [DrQA script](https://github.com/facebookresearch/DrQA/blob/master/scripts/retriever/build_db.py). + +### ruwiki_tfidf_matrix.npz + + **ruwiki_tfidf_matrix.npz** is a full Wikipedia tf-idf matrix of size `hash_size x number of documents` which is + `2**24 x 1463888`. This matrix is built with `deeppavlov/models/vectorizers/hashing_tfidf_vectorizer.HashingTfidfVectorizer` + class. + + +## References + +1. https://github.com/facebookresearch/DrQA \ No newline at end of file diff --git a/deeppavlov/skills/odqa/tfidf_ranker.py b/deeppavlov/models/ranking/tfidf_ranker.py similarity index 100% rename from deeppavlov/skills/odqa/tfidf_ranker.py rename to deeppavlov/models/ranking/tfidf_ranker.py diff --git a/deeppavlov/skills/seq2seq_go_bot/README.md b/deeppavlov/models/seq2seq_go_bot/README.md similarity index 100% rename from deeppavlov/skills/seq2seq_go_bot/README.md rename to deeppavlov/models/seq2seq_go_bot/README.md diff --git a/deeppavlov/skills/go_bot/__init__.py b/deeppavlov/models/seq2seq_go_bot/__init__.py similarity index 100% rename from deeppavlov/skills/go_bot/__init__.py rename to deeppavlov/models/seq2seq_go_bot/__init__.py diff --git a/deeppavlov/skills/seq2seq_go_bot/bot.py b/deeppavlov/models/seq2seq_go_bot/bot.py similarity index 96% rename from deeppavlov/skills/seq2seq_go_bot/bot.py rename to deeppavlov/models/seq2seq_go_bot/bot.py index 952905a36a..f4527ea45b 100644 --- a/deeppavlov/skills/seq2seq_go_bot/bot.py +++ b/deeppavlov/models/seq2seq_go_bot/bot.py @@ -15,14 +15,12 @@ """ import itertools -import numpy as np from typing import Type from deeppavlov.core.common.registry import register from deeppavlov.core.models.nn_model import NNModel from deeppavlov.core.data.vocab import DefaultVocabulary -from deeppavlov.models.embedders.fasttext_embedder import FasttextEmbedder -from deeppavlov.skills.seq2seq_go_bot.network import Seq2SeqGoalOrientedBotNetwork +from deeppavlov.models.seq2seq_go_bot.network import Seq2SeqGoalOrientedBotNetwork from deeppavlov.core.common.log import get_logger diff --git a/deeppavlov/skills/seq2seq_go_bot/kb.py b/deeppavlov/models/seq2seq_go_bot/kb.py similarity index 100% rename from deeppavlov/skills/seq2seq_go_bot/kb.py rename to deeppavlov/models/seq2seq_go_bot/kb.py diff --git a/deeppavlov/skills/seq2seq_go_bot/network.py b/deeppavlov/models/seq2seq_go_bot/network.py similarity index 99% rename from deeppavlov/skills/seq2seq_go_bot/network.py rename to deeppavlov/models/seq2seq_go_bot/network.py index 1456c28f00..30deaea262 100644 --- a/deeppavlov/skills/seq2seq_go_bot/network.py +++ b/deeppavlov/models/seq2seq_go_bot/network.py @@ -16,7 +16,6 @@ import json import tensorflow as tf -from tensorflow.contrib.layers import xavier_initializer from deeppavlov.core.common.registry import register from 
deeppavlov.core.common.errors import ConfigError
diff --git a/deeppavlov/models/tokenizers/ru_sent_tokenizer.py b/deeppavlov/models/tokenizers/ru_sent_tokenizer.py
new file mode 100644
index 0000000000..3c3e733b14
--- /dev/null
+++ b/deeppavlov/models/tokenizers/ru_sent_tokenizer.py
@@ -0,0 +1,31 @@
+from typing import List, Set, Tuple
+
+from deeppavlov.core.models.component import Component
+from deeppavlov.core.common.registry import register
+from rusenttokenize import ru_sent_tokenize, SHORTENINGS, JOINING_SHORTENINGS, PAIRED_SHORTENINGS
+
+
+@register("ru_sent_tokenizer")
+class RuSentTokenizer(Component):
+    """
+    Rule-based sentence tokenizer for Russian language.
+    https://github.com/deepmipt/ru_sentence_tokenizer
+    """
+    def __init__(self, shortenings: Set[str] = SHORTENINGS,
+                 joining_shortenings: Set[str] = JOINING_SHORTENINGS,
+                 paired_shortenings: Set[Tuple[str, str]] = PAIRED_SHORTENINGS):
+        """
+        Args:
+            shortenings: set of known shortenings. Use the default value if working on news or fiction texts
+            joining_shortenings: set of shortenings after which a sentence split is not possible (e.g. "ул").
+                Use the default value if working on news or fiction texts
+            paired_shortenings: set of known paired shortenings (e.g. "т. е.").
+                Use the default value if working on news or fiction texts
+
+        """
+        self.shortenings = shortenings
+        self.joining_shortenings = joining_shortenings
+        self.paired_shortenings = paired_shortenings
+
+    def __call__(self, batch: List[str]) -> List[List[str]]:
+        return [ru_sent_tokenize(x, self.shortenings, self.joining_shortenings, self.paired_shortenings) for x in batch]
diff --git a/deeppavlov/models/vectorizers/hashing_tfidf_vectorizer.py b/deeppavlov/models/vectorizers/hashing_tfidf_vectorizer.py
index de29bad89e..4a0a464792 100644
--- a/deeppavlov/models/vectorizers/hashing_tfidf_vectorizer.py
+++ b/deeppavlov/models/vectorizers/hashing_tfidf_vectorizer.py
@@ -22,7 +22,6 @@
 import numpy as np
 from sklearn.utils import murmurhash3_32
 
-from deeppavlov.models.tokenizers.spacy_tokenizer import StreamSpacyTokenizer
 from deeppavlov.core.models.component import Component
 from deeppavlov.core.models.serializable import Serializable
 from deeppavlov.core.common.log import get_logger
@@ -42,7 +41,7 @@ class HashingTfIdfVectorizer(Component, Serializable):
     Create a tfidf matrix from collection of documents.
""" - def __init__(self, hash_size=2 ** 24, tokenizer: Type = StreamSpacyTokenizer, doc_index: dict =None, + def __init__(self, tokenizer, hash_size=2 ** 24, doc_index: dict =None, save_path: str = None, load_path: str = None, **kwargs): """ diff --git a/deeppavlov/package_meta.py b/deeppavlov/package_meta.py index 9259e7bbc8..bd582e2bbc 100644 --- a/deeppavlov/package_meta.py +++ b/deeppavlov/package_meta.py @@ -1,2 +1,2 @@ -__version__ = '0.0.5.3' +__version__ = '0.0.6' __author__ = 'Neural Networks and Deep Learning lab, MIPT' diff --git a/deeppavlov/run_model.py b/deeppavlov/run_model.py index ab885fe764..4db87b0f0b 100644 --- a/deeppavlov/run_model.py +++ b/deeppavlov/run_model.py @@ -38,5 +38,7 @@ # PIPELINE_CONFIG_PATH = 'configs/odqa/ru_odqa_infer_prod.json' # PIPELINE_CONFIG_PATH = 'configs/odqa/ranker_test.json' -train_evaluate_model_from_config(PIPELINE_CONFIG_PATH) -# interact_model(PIPELINE_CONFIG_PATH) + +if __name__ == '__main__': + train_evaluate_model_from_config(PIPELINE_CONFIG_PATH) + # interact_model(PIPELINE_CONFIG_PATH) diff --git a/deeppavlov/skills/odqa/README.md b/deeppavlov/skills/odqa/README.md index a87ec8c839..bd9be3aefa 100644 --- a/deeppavlov/skills/odqa/README.md +++ b/deeppavlov/skills/odqa/README.md @@ -24,116 +24,36 @@ and its [implementation](https://github.com/HKUST-KnowComp/R-Net) by Wenxuan Zho ## Running ODQA -**Tensorflow-1.4.0 with GPU support is required** to run this model. +**Tensorflow-1.8.0 with GPU support is required** to run this model. + +**About 16 GB of RAM required** ## Training The ODQA ranker and ODQA reader should be trained separately. -**Warning: training the ranker on English Wikipedia requires 16 GB RAM.** Run the following to fit the ranker: -```bash -python -m deeppavlov train deeppavlov/configs/odqa/en_ranker_prod.json -``` -Read about training the reader in our separate [reader tutorial](https://github.com/deepmipt/DeepPavlov/tree/master/deeppavlov/models/squad). +Read about training the **ranker** in our separate [ranker tutorial](https://github.com/deepmipt/DeepPavlov/tree/master/deeppavlov/models/ranking/README_TFIDF.md). +Read about training the **reader** in our separate [reader tutorial](https://github.com/deepmipt/DeepPavlov/tree/master/deeppavlov/models/squad). ## Interacting -ODQA, reader and ranker can be interacted separately. **Warning: interacting the ranker and ODQA on English Wikipedia requires 16 GB RAM.** -Run the following to interact ODQA: +When interacted, the ODQA model returns a plain answer to the user's question. + +Run the following to interact **English** ODQA: ```bash -python -m deeppavlov train deeppavlov/configs/odqa/en_odqa_infer_prod.json +cd deeppavlov/ +python deep.py interact deeppavlov/configs/odqa/en_odqa_infer_wiki.json -d ``` Run the following to interact the ranker: ```bash -python -m deeppavlov interact deeppavlov/configs/odqa/en_ranker_prod.json +cd deeppavlov/ +python deep.py interact deeppavlov/configs/odqa/ru_odqa_infer_wiki.json -d ``` -Read about interacting the reader in our separate [reader tutorial](https://github.com/deepmipt/DeepPavlov/tree/master/deeppavlov/models/squad). ## Configuration The ODQA configs suit only model inferring purposes. The [ranker config](#the-ranker-config) should be used for ranker training and the [reader config](https://github.com/deepmipt/DeepPavlov/tree/master/deeppavlov/models/squad#config-components) should be used for reader training. 
-### Ranker - -The ranker config for **English** language can be found at `deeppavlov/configs/odqa/en_ranker_prod.json` - -The ranker config for **Russian** language can be found at `deeppavlov/configs/odqa/ru_ranker_prod.json` - -* **dataset_iterator** - downloads Wikipidia DB, creates batches for ranker fitting - * **_data_dir_** - a directory to download DB to - * **_data_url_** - an URL to download Wikipedia DB from - * **_shuffle_** - whether to perform shuffling when iterating over DB or not -* **chainer** - pipeline manager - * **_in_** - pipeline input data (questions) - * **_out_** - pipeline output data (Wikipedia articles ids and scores of the articles) -* **tfidf_ranker** - the ranker class - * **_in_** - ranker input data (questions) - * **_out_** - ranker output data (Wikipedia articles ids) - * **_fit_on_batch_** - fit the ranker on batches of Wikipedia articles - * **_vectorizer_** - a vectorizer class - * **_fit_on_batch_** - fit the vectorizer on batches of Wikipedia articles - * **_save_path_** - a path to serialize a vectorizer to - * **_load_path_** - a path to load a vectorizer from - * **_tokenizer_** - a tokenizer class - * **_lemmas_** - whether to lemmatize tokens or not - * **_ngram_range_** - ngram range for vectorizer features -* **train** - parameters for vectorizer fitting - * **_validate_best_**- is ingnored, any value - * **_test_best_** - is ignored, any value - * **_batch_size_** - how many Wikipedia articles should return the dataset iterator in a single batch - -### ODQA - -Default ODQA config for **English** language is `deeppavlov/configs/odqa/en_odqa_infer_prod.json` - -Default ODQA config for **Russian** language is `deeppavlov/configs/odqa/ru_odqa_infer_prod.json` - -The components of ODQA config can be referred to [ranker config](#the-ranker-config) -and [reader config](https://github.com/deepmipt/DeepPavlov/tree/master/deeppavlov/models/squad#config-components) accordingly. -However, main inputs and outputs are worth explaining: - -* **chainer** - pipeline manager - * **_in_** - pipeline input data (questions) - * **_out_** - pipeline output data (answers) - -## Pretrained models - -Wikipedia data and pretrained ODQA models are downloaded in `deeppavlov/download/odqa` by default. - -### enwiki.db - -**enwiki.db** SQLite database consists of **5159530 Wikipedia articles** -and is built by the following steps: -1. Download a Wikipedia dump file. We took the latest [enwiki](https://dumps.wikimedia.org/enwiki/20180201) - (from 2018-02-11) -2. Unpack and extract the articles with [WikiExtractor](https://github.com/attardi/wikiextractor) - (with `--json`, `--no-templates`, `--filter_disambig_pages` options) -3. Build a database with the help of [DrQA script](https://github.com/facebookresearch/DrQA/blob/master/scripts/retriever/build_db.py). - -### enwiki_tfidf_matrix.npz - - **enwiki_tfidf_matrix.npz** is a full Wikipedia tf-idf matrix of size `hash_size x number of documents` which is - `2**24 x 5159530`. This matrix is built with `deeppavlov/models/vectorizers/hashing_tfidf_vectorizer.HashingTfidfVectorizer` - class. - -### ruwiki.db - -**ruwiki.db** SQLite database consists of **1463888 Wikipedia articles** -and is built by the following steps: -1. Download a Wikipedia dump file. We took the latest [ruwiki](https://dumps.wikimedia.org/ruwiki/20180401) -(from 2018-04-01) -2. Unpack and extract the articles with [WikiExtractor](https://github.com/attardi/wikiextractor) -(with `--json`, `--no-templates`, `--filter_disambig_pages` options) -3. 
Build a database with the help of [DrQA script](https://github.com/facebookresearch/DrQA/blob/master/scripts/retriever/build_db.py). - -### ruwiki_tfidf_matrix.npz - - **ruwiki_tfidf_matrix.npz** is a full Wikipedia tf-idf matrix of size `hash_size x number of documents` which is - `2**24 x 1463888`. This matrix is built with `deeppavlov/models/vectorizers/hashing_tfidf_vectorizer.HashingTfidfVectorizer` - class. - - ## References - 1. https://github.com/facebookresearch/DrQA 2. https://github.com/HKUST-KnowComp/R-Net \ No newline at end of file diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000000..3e631e577a --- /dev/null +++ b/examples/README.md @@ -0,0 +1,30 @@ +# This is "Hello world!" example of simple bot implemented in DeepPavlov + +Import key components to build HelloBot. +```python +from deeppavlov.core.agent import Agent, HighestConfidenceSelector +from deeppavlov.skills.pattern_matching_skill import PatternMatchingSkill +``` + +Create skills as pre-defined responses for a user's input containing specific keywords. Every skill returns response and confidence. +```python +hello = PatternMatchingSkill(responses=['Hello world! :)'], patterns=["hi", "hello", "good day"]) +bye = PatternMatchingSkill(['Goodbye world! :(', 'See you around.'], ["bye", "chao", "see you"]) +fallback = PatternMatchingSkill(["I don't understand, sorry :/", 'I can say "Hello world!" 8)']) +``` + +Agent executes skills and then takes response from the skill with the highest confidence. +```python +HelloBot = Agent([hello, bye, fallback], skills_selector=HighestConfidenceSelector()) +``` + +Give the floor to the HelloBot! +```python +print(HelloBot(['Hello!', 'Boo...', 'Bye.'])) +``` + +[Jupyther notebook with HelloBot example.](hello_bot.ipynb) + +# Tutorials + +To dive deeper into DeepPavlov work through [tutorials](tutorials/). diff --git a/examples/hello_agent.ipynb b/examples/hello_bot.ipynb similarity index 56% rename from examples/hello_agent.ipynb rename to examples/hello_bot.ipynb index 7510ef6f6e..87c6cfdded 100644 --- a/examples/hello_agent.ipynb +++ b/examples/hello_bot.ipynb @@ -4,29 +4,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/yoptar/reps/DeepPavlov/venv/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", - " from ._conv import register_converters as _register_converters\n", - "Using TensorFlow backend.\n", - "[nltk_data] Downloading package punkt to /home/yoptar/nltk_data...\n", - "[nltk_data] Package punkt is already up-to-date!\n", - "[nltk_data] Downloading package stopwords to /home/yoptar/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n", - "[nltk_data] Downloading package perluniprops to\n", - "[nltk_data] /home/yoptar/nltk_data...\n", - "[nltk_data] Package perluniprops is already up-to-date!\n", - "[nltk_data] Downloading package nonbreaking_prefixes to\n", - "[nltk_data] /home/yoptar/nltk_data...\n", - "[nltk_data] Package nonbreaking_prefixes is already up-to-date!\n", - "2018-06-22 11:47:45.69 DEBUG in 'gensim.models.doc2vec'['doc2vec'] at line 73: Fast version of gensim.models.doc2vec is being used\n", - "2018-06-22 11:47:45.73 INFO in 'summa.preprocessing.cleaner'['textcleaner'] at line 20: 'pattern' package not found; tag filters are not available for English\n" - ] - } - ], + "outputs": [], "source": [ "from deeppavlov.skills.pattern_matching_skill import PatternMatchingSkill\n", "from deeppavlov.core.agent import Agent, HighestConfidenceSelector" @@ -38,7 +16,7 @@ "metadata": {}, "outputs": [], "source": [ - "hello = PatternMatchingSkill(['Hello world!'], patterns=[\"hi\", \"hello\", \"good day\"])\n", + "hello = PatternMatchingSkill(responses=['Hello world!'], patterns=[\"hi\", \"hello\", \"good day\"])\n", "bye = PatternMatchingSkill(['Goodbye world!', 'See you around'],\n", " patterns=[\"bye\", \"chao\", \"see you\"])\n", "fallback = PatternMatchingSkill([\"I don't understand, sorry\", 'I can say \"Hello world!\"'])" diff --git a/examples/hello_bot.py b/examples/hello_bot.py new file mode 100644 index 0000000000..90c3630188 --- /dev/null +++ b/examples/hello_bot.py @@ -0,0 +1,17 @@ +# This is "Hello world!" example of simple bot implemented in DeepPavlov. +# +# Imports key components to build HelloBot. +from deeppavlov.core.agent import Agent, HighestConfidenceSelector +from deeppavlov.skills.pattern_matching_skill import PatternMatchingSkill + +# Creates skills as pre-defined responses for a user's input containing specific keywords. +# Every skill returns response and confidence. +hello = PatternMatchingSkill(responses=['Hello world! :)'], patterns=["hi", "hello", "good day"]) +bye = PatternMatchingSkill(['Goodbye world! :(', 'See you around.'], ["bye", "chao", "see you"]) +fallback = PatternMatchingSkill(["I don't understand, sorry :/", 'I can say "Hello world!" 8)']) + +# Agent executes skills and then takes response from the skill with the highest confidence. +HelloBot = Agent([hello, bye, fallback], skills_selector=HighestConfidenceSelector()) + +# Give the floor to the HelloBot! +print(HelloBot(['Hello!', 'Boo...', 'Bye.'])) \ No newline at end of file diff --git a/examples/tutorials/00_deeppavlov_intro.ipynb b/examples/tutorials/00_deeppavlov_intro.ipynb new file mode 100644 index 0000000000..02141e21d1 --- /dev/null +++ b/examples/tutorials/00_deeppavlov_intro.ipynb @@ -0,0 +1,213 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction to DeepPavlov\n", + "In this tutorial we will learn how to install and construct a simple bot based on pattern matching and the library abstactions." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Libraries\n", + "\n", + "For this task you will need the following libraries:\n", + " - [Tensorflow](https://www.tensorflow.org) — an open-source software library for Machine Intelligence.\n", + " - [Numpy](http://www.numpy.org) — a package for scientific computing.\n", + " - [DeepPavlov](https://github.com/deepmipt/deeppavlov) - open source library for Natural Language Processing\n", + " \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Installation of DeepPavlov library\n", + "\n", + "Currently only Linux platform and Python 3.6 is supported\n", + "\n", + "- Create a virtual environment with Python 3.6\n", + "\n", + " `virtualenv -p python3.6 env`\n", + "\n", + "- Activate the environment.\n", + "\n", + " `source ./env/bin/activate`\n", + "\n", + "- Clone the repo and cd to project root\n", + "\n", + " `git clone https://github.com/deepmipt/DeepPavlov.git`\n", + " \n", + " `cd DeepPavlov`\n", + "\n", + "- Install basic requirements:\n", + "\n", + " `python setup.py develop`\n", + "\n", + "- Install packages required for these tutorials:\n", + "\n", + " `python -m deeppavlov install gobot_dstc2`\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Install the library on Windows using Docker\n", + "\n", + "First, install the Docker following these instructions:\n", + "\n", + "https://docs.docker.com/docker-for-windows/install\n", + "\n", + "Then go to console and get the container with the following command:\n", + "\n", + "`docker pull altinsky/convai:deeppavlov`\n", + "\n", + "Run the container:\n", + "\n", + "`docker run -p 8888:8888 altinsky/convai:deeppavlov`\n", + "\n", + "Navigate to http://127.0.0.1:8888/ in your browser.\n", + "\n", + "To STOP the container run:\n", + "\n", + "`docker stop`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# HelloBot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this part we will construct a simple bot that relies on pattern matching to perform a conversation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from deeppavlov.skills.pattern_matching_skill import PatternMatchingSkill\n", + "from deeppavlov.core.agent import Agent, HighestConfidenceSelector" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A pattern matching skill is the simplest example of Natural Language Understanding component. It will search defined patterns through the text. Let's define some simple patterns:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "hello = PatternMatchingSkill(responses=['Hello world! :)'], patterns=[\"hi\", \"hello\", \"good day\"])\n", + "bye = PatternMatchingSkill(['Goodbye world! :(', 'See you around.'], [\"bye\", \"chao\", \"see you\"])\n", + "fallback = PatternMatchingSkill([\"I don't understand, sorry :/\", 'I can say \"Hello world!\" 8)'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you provide some patterns to the PatternMatchingSkill it will return confidence = 1 when the skill finds the pattern in given text. If no patterns is provided then confidence 0.5 will be returned in any case." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The skills are used in the `Agent` which can be treated as a Dialog Manager. The agent must be provided with skills and the selector of skills. A simple skill selector is the HighestConfidenceSelector which will pick the skill with highest confidence." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "HelloBot = Agent([hello, bye, fallback], skills_selector=HighestConfidenceSelector())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since all processing in the library is performed on batches, we can pass a batch of requests to the bot. Let's try it out:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "HelloBot(['Hello!', 'Boo...', 'Bye.'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Exercise** \n", + "- create a WhatIsYourName skill\n", + "- create new agent with this skill\n", + "- check that all works fine" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "name = # YOUR_CODE_HERE" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [Root]", + "language": "python", + "name": "Python [Root]" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/tutorials/00_deeppavlov_intro.pdf b/examples/tutorials/00_deeppavlov_intro.pdf new file mode 100644 index 0000000000..8e2c7b5761 Binary files /dev/null and b/examples/tutorials/00_deeppavlov_intro.pdf differ diff --git a/examples/tutorials/01_deeppavlov_data.ipynb b/examples/tutorials/01_deeppavlov_data.ipynb new file mode 100644 index 0000000000..d9a3579750 --- /dev/null +++ b/examples/tutorials/01_deeppavlov_data.ipynb @@ -0,0 +1,508 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data preparation in DeepPavlov\n", + "Learn how to read and prepare data for trainable components." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data\n", + "Deeppavlov library has functionality to download and decompress the data. For this purpose the `download_decompress` from `data.utils` is used. \n", + "The following cell will download the CoNLL-2003 data for the Named Entity Recognition (NER) task and put it to the folder `data/`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import deeppavlov\n", + "from deeppavlov.core.data.utils import download_decompress\n", + "download_decompress('http://lnsigo.mipt.ru/export/deeppavlov_data/conll2003_v2.tar.gz', 'data/')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parsing text data into a machine-readable dataset \n", + "\n", + "We will work with a corpus which contains tweets with NE tags. A typical file with NER data contains lines with pairs of tokens (word or punctuation symbol) and tags separated by a whitespace. In many cases additional information such as POS-tags is included. 
\n", + "\n", + "Different documents are separated by lines **started** with **-DOCSTART-** token. Different sentences are separated by an empty line. Example:\n", + "\n", + " -DOCSTART- -X- -X- O\n", + "\n", + " EU NNP B-NP B-ORG\n", + " rejects VBZ B-VP O\n", + " German JJ B-NP B-MISC\n", + " call NN I-NP O\n", + " to TO B-VP O\n", + " boycott VB I-VP O\n", + " British JJ B-NP B-MISC\n", + " lamb NN I-NP O\n", + " . . O O\n", + "\n", + " Peter NNP B-NP B-PER\n", + " Blackburn NNP I-NP I-PER\n", + "\n", + "In this tutorial we will focus only on tokens and tags (first and last elements of the line) and drop POS information located between them.\n", + "\n", + "We start by building a class *NerDatasetReader* that provides functionality for reading the dataset. It returns a dictionary with fields *train*, *test*, and *valid*. Each field stores a list of samples. Each sample is a tuple of tokens and tags. Both tokens and tags are lists. The following example depicts the structure that should be returned by *read* method:\n", + "\n", + " {'train': [(['Mr.', 'Dwag', 'are', 'derping', 'around'], ['B-PER', 'I-PER', 'O', 'O', 'O']), ....],\n", + " 'valid': [...],\n", + " 'test': [...]}\n", + "\n", + "There are three separate parts in the dataset:\n", + " - *train* data for training the model;\n", + " - *validation* data for evaluation and hyperparameters tuning;\n", + " - *test* data for final evaluation of the model.\n", + " \n", + "\n", + "Each of these parts is stored in a separate txt file.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "class NerDatasetReader:\n", + " def read(self, data_path):\n", + " data_parts = ['train', 'valid', 'test']\n", + " extension = '.txt'\n", + " dataset = {}\n", + " for data_part in data_parts:\n", + " file_path = Path(data_path) / Path(data_part + extension)\n", + " dataset[data_part] = self.read_file(str(file_path))\n", + " return dataset\n", + " \n", + " @staticmethod\n", + " def read_file(file_path):\n", + " \n", + " # Use utf-8 encoding when open the file\n", + " ######################################\n", + " ########## YOUR CODE HERE ############\n", + " ######################################\n", + " return samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "dataset_reader = NerDatasetReader()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "dataset = dataset_reader.read('data/')\n", + "assert len(dataset) == 3, 'The dataset must be a dict with three fields: train, test, and valid'\n", + "assert len(set(dataset) & {'train', 'test', 'valid'}) == 3, 'The dataset keys must be exactly train, test, and valid'\n", + "assert isinstance(dataset['train'][0][0][0], str) and isinstance(dataset['train'][0][0][1], str), 'Both tokens and tags must be strings'\n", + "assert len(dataset['train']) == 14041, 'there must be exactly 14041 samples in train'\n", + "assert len(dataset['valid']) == 3250, 'there must be exactly 3250 samples in train'\n", + "assert len(dataset['test']) == 3453, 'there must be exactly 3453 samples in test'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should always understand what kind of data you deal with. 
For this purpose, you can print the data by running the code in the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "for sample in dataset['train'][:2]:\n", + " for token, tag in zip(*sample):\n", + " print('%s\\t%s' % (token, tag))\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can find an implementation of the dataset reader that implemets the same interfaces in the library: [Conll2003DatasetReader](https://github.com/deepmipt/DeepPavlov/blob/dev/deeppavlov/dataset_readers/conll2003_reader.py). The functionality of the presented code is wider and the `register` wrapper allows to use this component as a part of config file (will be discussed later)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare dictionaries\n", + "\n", + "To train a neural network, we will use two mappings: \n", + "- {token}$\\to${token id}: address the row in embeddings matrix for the current token;\n", + "- {tag}$\\to${tag id}: one-hot ground truth probability distribution vectors for computing the loss at the output of the network.\n", + "\n", + "Token indices will be used to address a row in embeddings matrix. The mapping for tags will be used to create one-hot ground-truth probability distribution vectors to compute the loss at the output of the network.\n", + "\n", + "Now you need to implement the *Vocab* class which will return {token or tag}$\\to${index} and vice versa. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from collections import defaultdict, Counter\n", + "from itertools import chain\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class Vocab:\n", + " def __init__(self,\n", + " special_tokens=tuple()):\n", + " self.special_tokens = special_tokens\n", + " self._t2i = defaultdict(lambda: 1)\n", + " self._i2t = []\n", + " \n", + " def fit(self, tokens):\n", + " count = 0\n", + " self.freqs = Counter(chain(*tokens))\n", + " # The first special token will be the default token\n", + " for special_token in self.special_tokens:\n", + " self._t2i[special_token] = count\n", + " self._i2t.append(special_token)\n", + " count += 1\n", + " for token, freq in self.freqs.most_common():\n", + " if token not in self._t2i:\n", + " self._t2i[token] = count\n", + " self._i2t.append(token)\n", + " count += 1\n", + "\n", + " def __call__(self, batch, **kwargs):\n", + " # Implement the vocab() method. The input could be a batch of tokens\n", + " # or a batch of indices. A batch is a list of utterances where each\n", + " # utterance is a list of tokens\n", + " pass\n", + " ######################################\n", + " ########## YOUR CODE HERE ############\n", + " ######################################\n", + "\n", + " def __getitem__(self, key):\n", + " # Implement the vocab[] method. The input could be a token\n", + " # (string) or an index. You have to detect what type of data\n", + " # is key and return. 
\n", + " pass\n", + " ######################################\n", + " ########## YOUR CODE HERE ############\n", + " ######################################\n", + " \n", + " def __len__(self):\n", + " return len(self._i2t)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After implementing the function *build_dict* you can make dictionaries for tokens and tags. Special tokens in our case will be:\n", + " - `` token for out of vocabulary tokens\n", + " - `'O'` for the tag vocab to place out of label tag to the first place with index 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "special_tokens = ['']\n", + "special_tags = ['O']\n", + "\n", + "token_vocab = Vocab(special_tokens)\n", + "tag_vocab = Vocab(special_tags)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will fit the vocabularies on the *train* part of the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "all_tokens_by_sentenses = [tokens for tokens, tags in dataset['train']]\n", + "all_tags_by_sentenses = [tags for tokens, tags in dataset['train']]\n", + "\n", + "token_vocab.fit(all_tokens_by_sentenses)\n", + "tag_vocab.fit(all_tags_by_sentenses)\n", + "\n", + "assert len(token_vocab) == 23624, 'There must be exactly 23624 in the token vocab!'\n", + "assert len(tag_vocab) == 9, 'There must be exactly 9 in the tag vocab!'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Try to get the indices. Keep in mind that we are working with batches of the following structure:\n", + " \n", + " [['utt0_tok0', 'utt1_tok1', ...], ['utt1_tok0', 'utt1_tok1', ...], ...]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "indices_batch = token_vocab([['How', 'to', 'cook', 'a', 'turnip', '?']])\n", + "\n", + "assert len(indices_batch) == 1, 'the batch length must be 1'\n", + "assert isinstance(indices_batch[0][0], int), 'The batch must contain lists of ints!'\n", + "\n", + "print(indices_batch)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "tag_indices_batch = tag_vocab([['O', 'O', 'O'], ['B-PER']])\n", + "\n", + "assert len(tag_indices_batch) == 2, 'the batch length must be 2'\n", + "assert isinstance(tag_indices_batch[0][0], int), 'The batch must contain lists of ints!'\n", + "\n", + "print(tag_indices_batch)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will try converting from indices to tokens." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "token_vocab([np.random.randint(0, 512, size=10)])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A similar vocabulary is already implemented in the [library](https://github.com/deepmipt/DeepPavlov/blob/dev/deeppavlov/core/data/simple_vocab.py). 
It has extended functionality:\n", + "- token cutoff by frequency\n", + "- limitation of the vocabulary size\n", + "- saving and loading\n", + "- dict like dunders (\\_\\_contain\\_\\_, \\_\\_len\\_\\_, etc.)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dataset Iterator\n", + "\n", + "Neural Networks are usually trained with batches. It means that weight updates of the network are based on several sequences at every single time. You have to iterate over the dataset and generate `x` and `y` batch by batch. The batch of `x`-s is a list of sentences of tokens like\n", + "\n", + " [['Yan', 'is', 'a', 'good', 'fellow],\n", + " ['For', 'instance']]\n", + "\n", + "and the tag sequence should be:\n", + "\n", + " [['B-PER', 'O', 'O', 'O', 'O'],\n", + " ['O', 'O']]\n", + "\n", + "An important concept in the batch generation is shuffling. Shuffling is taking sample from the dataset at random order. It is important to train on the shuffled data because large number consequetive samples of the same class may result in pure quality of the model.\n", + " \n", + "The idea behind the iterator is to perform computation in the lazy way. Use yield generator expression to do so. An example of using yield for generator creation is provided below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def iterator():\n", + " data = [1, 2, 3]\n", + " for d in data:\n", + " yield d\n", + " \n", + "print(iterator)\n", + " \n", + "for i in iterator():\n", + " print(i)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now create the `DatasetIterator`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class DatasetIterator:\n", + " def __init__(self, data):\n", + " self.data = {\n", + " 'train': data['train'],\n", + " 'valid': data['valid'],\n", + " 'test': data['test']\n", + " }\n", + "\n", + " def gen_batches(self, batch_size, data_type='train', shuffle=True):\n", + " ######################################\n", + " ########## YOUR CODE HERE ############\n", + " ######################################\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the dataset iterator from the loaded dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "data_iterator = DatasetIterator(dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Try it out:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "x, y = next(data_iterator.gen_batches(2))\n", + "\n", + "assert len(x) == 2, 'There must be two examples in the batch!'\n", + "assert len(y) == 2, 'There must be two examples in the batch!'\n", + "assert len(x[0]) == len(y[0]), 'The numbers of tokens and tags are different!'\n", + "assert isinstance(x[0][0], str), 'Token must be a string!'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is a typical part of the data preprocessing pipeline. This parts will be used in the following tutorials. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [Root]", + "language": "python", + "name": "Python [Root]" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/tutorials/02_deeppavlov_ner.ipynb b/examples/tutorials/02_deeppavlov_ner.ipynb new file mode 100644 index 0000000000..59419eadba --- /dev/null +++ b/examples/tutorials/02_deeppavlov_ner.ipynb @@ -0,0 +1,1607 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Recognize named entities on news data with CNN\n", + "\n", + "In this tutorial, you will use a convolutional neural network to solve Named Entity Recognition (NER) problem. NER is a common task in natural language processing systems. It serves for extraction such entities from the text as persons, organizations, locations, etc. In this task you will experiment to recognize named entities in different news from common CoNLL-2003 dataset.\n", + "\n", + "For example, we want to extract persons' and organizations' names from the text. Then for the input text:\n", + "\n", + " Yan Goodfellow works for Google Brain\n", + "\n", + "a NER model needs to provide the following sequence of tags:\n", + "\n", + " B-PER I-PER O O B-ORG I-ORG\n", + "\n", + "Where *B-* and *I-* prefixes stand for the beginning and inside of the entity, while *O* stands for out of tag or no tag. Markup with the prefix scheme is called *BIO markup*. This markup is introduced for distinguishing of consequent entities with similar types.\n", + "\n", + "A solution of the task will be based on neural networks, particularly, on Convolutional Neural Networks." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data\n", + "\n", + "The following cell will download all data required for this assignment into the folder `/data`. The download util from the library is used to download and extract the archive." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/mikhail/env/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", + " from ._conv import register_converters as _register_converters\n", + "Using TensorFlow backend.\n", + "[nltk_data] Downloading package punkt to /home/mikhail/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] /home/mikhail/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package perluniprops to\n", + "[nltk_data] /home/mikhail/nltk_data...\n", + "[nltk_data] Package perluniprops is already up-to-date!\n", + "[nltk_data] Downloading package nonbreaking_prefixes to\n", + "[nltk_data] /home/mikhail/nltk_data...\n", + "[nltk_data] Package nonbreaking_prefixes is already up-to-date!\n", + "2018-06-27 12:30:29.760 DEBUG in 'gensim.models.doc2vec'['doc2vec'] at line 73: Fast version of gensim.models.doc2vec is being used\n", + "2018-06-27 12:30:29.764 INFO in 'summa.preprocessing.cleaner'['textcleaner'] at line 20: 'pattern' package not found; tag filters are not available for English\n", + "2018-06-27 12:30:34.248 DEBUG in 'urllib3.connectionpool'['connectionpool'] at line 208: Starting new HTTP connection (1): lnsigo.mipt.ru\n", + "2018-06-27 12:30:34.251 DEBUG in 'urllib3.connectionpool'['connectionpool'] at line 396: http://lnsigo.mipt.ru:80 \"GET /export/deeppavlov_data/conll2003_v2.tar.gz HTTP/1.1\" 200 957092\n", + "2018-06-27 12:30:34.253 INFO in 'deeppavlov.core.data.utils'['utils'] at line 65: Downloading from http://lnsigo.mipt.ru/export/deeppavlov_data/conll2003_v2.tar.gz to /home/mikhail/Projects/tutorial/data/conll2003_v2.tar.gz\n", + "100%|██████████| 957k/957k [00:00<00:00, 39.0MB/s]\n", + "2018-06-27 12:30:34.279 INFO in 'deeppavlov.core.data.utils'['utils'] at line 149: Extracting data/conll2003_v2.tar.gz archive into data\n" + ] + } + ], + "source": [ + "import deeppavlov\n", + "from deeppavlov.core.data.utils import download_decompress\n", + "download_decompress('http://lnsigo.mipt.ru/export/deeppavlov_data/conll2003_v2.tar.gz', 'data/')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load the CoNLL-2003 Named Entity Recognition corpus\n", + "\n", + "We will work with a corpus, which contains twits with NE tags. Typical file with NER data contains lines with pairs of tokens (word/punctuation symbol) and tags, separated by a whitespace. In many cases additional information such as POS tags included between Different documents are separated by lines **started** with **-DOCSTART-** token. Different sentences are separated by an empty line. Example\n", + "\n", + " -DOCSTART- -X- -X- O\n", + "\n", + " EU NNP B-NP B-ORG\n", + " rejects VBZ B-VP O\n", + " German JJ B-NP B-MISC\n", + " call NN I-NP O\n", + " to TO B-VP O\n", + " boycott VB I-VP O\n", + " British JJ B-NP B-MISC\n", + " lamb NN I-NP O\n", + " . . O O\n", + "\n", + " Peter NNP B-NP B-PER\n", + " Blackburn NNP I-NP I-PER\n", + "\n", + "In this tutorial we will focus only on tokens and tags (first and last elements of the line) and drop POS information located in between.\n", + "\n", + "We start with using the *Conll2003DatasetReader* class that provides functionality for reading the dataset. It returns a dictionary with fields *train*, *test*, and *valid*. At each field a list of samples is stored. Each sample is a tuple of tokens and tags. Both tokens and tags are lists. 
The following example depicts the structure that should be returned by *read* method:\n", + "\n", + " {'train': [(['Mr.', 'Dwag', 'are', 'derping', 'around'], ['B-PER', 'I-PER', 'O', 'O', 'O']), ....],\n", + " 'valid': [...],\n", + " 'test': [...]}\n", + "\n", + "There are three separate parts of the dataset:\n", + " - *train* data for training the model;\n", + " - *validation* data for evaluation and hyperparameters tuning;\n", + " - *test* data for final evaluation of the model.\n", + " \n", + "\n", + "Each of these parts is stored in a separate txt file.\n", + "\n", + "We will use [Conll2003DatasetReader](https://github.com/deepmipt/DeepPavlov/blob/master/deeppavlov/dataset_readers/conll2003_reader.py) from the library to read the data from text files to the format described above." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from deeppavlov.dataset_readers.conll2003_reader import Conll2003DatasetReader\n", + "dataset = Conll2003DatasetReader().read('data/')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should always understand what kind of data you deal with. For this purpose, you can print the data running the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EU\tB-ORG\n", + "rejects\tO\n", + "German\tB-MISC\n", + "call\tO\n", + "to\tO\n", + "boycott\tO\n", + "British\tB-MISC\n", + "lamb\tO\n", + ".\tO\n", + "\n", + "Peter\tB-PER\n", + "Blackburn\tI-PER\n", + "\n", + "BRUSSELS\tB-LOC\n", + "1996-08-22\tO\n", + "\n", + "The\tO\n", + "European\tB-ORG\n", + "Commission\tI-ORG\n", + "said\tO\n", + "on\tO\n", + "Thursday\tO\n", + "it\tO\n", + "disagreed\tO\n", + "with\tO\n", + "German\tB-MISC\n", + "advice\tO\n", + "to\tO\n", + "consumers\tO\n", + "to\tO\n", + "shun\tO\n", + "British\tB-MISC\n", + "lamb\tO\n", + "until\tO\n", + "scientists\tO\n", + "determine\tO\n", + "whether\tO\n", + "mad\tO\n", + "cow\tO\n", + "disease\tO\n", + "can\tO\n", + "be\tO\n", + "transmitted\tO\n", + "to\tO\n", + "sheep\tO\n", + ".\tO\n", + "\n" + ] + } + ], + "source": [ + "for sample in dataset['train'][:4]:\n", + " for token, tag in zip(*sample):\n", + " print('%s\\t%s' % (token, tag))\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare dictionaries\n", + "\n", + "To train a neural network, we will use two mappings: \n", + "- {token}$\\to${token id}: address the row in embeddings matrix for the current token;\n", + "- {tag}$\\to${tag id}: one-hot ground truth probability distribution vectors for computing the loss at the output of the network.\n", + "\n", + "Token indices will be used to address the row in embeddings matrix. The mapping for tags will be used to create one-hot ground truth probability distribution vectors to compute the loss at the output of the network.\n", + "\n", + "The [SimpleVocabulary](https://github.com/deepmipt/DeepPavlov/blob/master/deeppavlov/core/data/simple_vocab.py) implemented in the library will be used to perform those mappings." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from deeppavlov.core.data.simple_vocab import SimpleVocabulary" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we need to build dictionaries for tokens and tags. 
Sometimes there are special tokens in vocabularies, for instance an unknown word token, which is used every time we encounter out of vocabulary word. In our case the only special token will be`` for out of vocabulary words." + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:41:29.316 WARNING in 'deeppavlov.core.models.serializable'['serializable'] at line 53: No load path is set for SimpleVocabulary in 'infer' mode. Using save path instead\n", + "2018-06-27 13:41:29.317 WARNING in 'deeppavlov.core.models.serializable'['serializable'] at line 53: No load path is set for SimpleVocabulary in 'infer' mode. Using save path instead\n" + ] + } + ], + "source": [ + "special_tokens = ['']\n", + "\n", + "token_vocab = SimpleVocabulary(special_tokens, save_path='model/token.dict')\n", + "tag_vocab = SimpleVocabulary(save_path='model/tag.dict')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lets fit the vocabularies on the train part of the data." + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "all_tokens_by_sentences = [tokens for tokens, tags in dataset['train']]\n", + "all_tags_by_sentences = [tags for tokens, tags in dataset['train']]\n", + "\n", + "token_vocab.fit(all_tokens_by_sentences)\n", + "tag_vocab.fit(all_tags_by_sentences)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Try to get the indices. Keep in mind that we are working with batches of the following structure:\n", + " \n", + " [['utt0_tok0', 'utt1_tok1', ...], ['utt1_tok0', 'utt1_tok1', ...], ...]" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[10167, 6, 168, 7, 6097, 5518, 1865]]" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "token_vocab([['How', 'to', 'do', 'a', 'barrel', 'roll', '?']])" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[0, 0, 0], [3, 5]]" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tag_vocab([['O', 'O', 'O'], ['B-ORG', 'I-ORG']])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will try conversion from indices to tokens." + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['into',\n", + " 'another',\n", + " 'CHICAGO',\n", + " 'capital',\n", + " 'But',\n", + " 'Wednesday',\n", + " '20',\n", + " '2',\n", + " 'into',\n", + " 'years']]" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "token_vocab([np.random.randint(0, 512, size=10)])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dataset Iterator\n", + "\n", + "Neural Networks are usually trained with batches. It means that weight updates of the network are based on several sequences at every single time. The tricky part is that all sequences within a batch need to have the same length. So we will pad them with a special `` token. 
Likewise tokens tags also must be padded It is also a good practice to provide RNN with sequence lengths, so it can skip computations for padding parts. We provide the batching function *batches_generator* readily available for you to save time. \n", + "\n", + "An important concept in the batch generation is shuffling. Shuffling is taking sample from the dataset at random order. It is important to train on the shuffled data because large number consequetive samples of the same class may result in pure quality of the model." + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from deeppavlov.core.data.data_learning_iterator import DataLearningIterator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the dataset iterator from the loaded dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "data_iterator = DataLearningIterator(dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Try it out:" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((['Corinthians', '1', 'Guarani', '0'],\n", + " ['The',\n", + " 'Richmond-based',\n", + " 'retailer',\n", + " 'lost',\n", + " '$',\n", + " '95.7',\n", + " 'million',\n", + " 'in',\n", + " 'the',\n", + " 'fiscal',\n", + " 'year',\n", + " 'ended',\n", + " 'February',\n", + " '3',\n", + " '.']),\n", + " (['B-ORG', 'O', 'B-ORG', 'O'],\n", + " ['O',\n", + " 'B-MISC',\n", + " 'O',\n", + " 'O',\n", + " 'O',\n", + " 'O',\n", + " 'O',\n", + " 'O',\n", + " 'O',\n", + " 'O',\n", + " 'O',\n", + " 'O',\n", + " 'O',\n", + " 'O',\n", + " 'O']))" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(data_iterator.gen_batches(2, shuffle=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Masking" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The last thing about generating training data. We need to produce a binary mask which is one where tokens present and zero elsewhere. This mask will stop backpropagation through paddings. An instance of such mask:\n", + "\n", + " [[1, 1, 0, 0, 0],\n", + " [1, 1, 1, 1, 1]]\n", + " For the sentences in batch:\n", + "\n", + " [['The', 'roof'],\n", + " ['This', 'is', 'my', 'domain', '!']]\n", + "\n", + "The mask length must be equal to the maximum length of the sentence in the batch." + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.models.preprocessors.mask import Mask\n", + "get_mask = Mask()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Try it out:" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1., 1., 1., 1., 1.],\n", + " [1., 1., 0., 0., 0.]], dtype=float32)" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_mask([['Try', 'to', 'get', 'the', 'mask'], ['Check', 'paddings']])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build a recurrent neural network\n", + "\n", + "This is the most important part of the assignment. Here we will specify the network architecture based on TensorFlow building blocks. 
It's fun and easy as a lego constructor! We will create an Convolutional Neural Network (CNN) network which will produce probability distribution over tags for each token in a sentence. To take into account both right and left contexts of the token, we will use CNN. Dense layer will be used on top to perform tag classification. " + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import numpy as np\n", + "\n", + "np.random.seed(42)\n", + "tf.set_random_seed(42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An essential part of almost every network in NLP domain is embeddings of the words. We pass the text to the network as a series of tokens. Each token is represented by its index. For every token (index) we have a vector. In total the vectors form an embedding matrix. This matrix can be either pretrained using some common algorithm like Skip-Gram or CBOW or it can be initialized by random values and trained along with other parameters of the network. In this tutorial we will follow the second alternative.\n", + "\n", + "We need to build a function that takes the tensor of token indices with shape [batch_size, num_tokens] and for each index in this matrix it retrieves a vector from the embedding matrix, corresponding to that index. That results in a new tensor with sahpe [batch_size, num_tokens, emb_dim]." + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "def get_embeddings(indices, vocabulary_size, emb_dim):\n", + " # Initialize the random gaussian matrix with dimensions [vocabulary_size, embedding_dimension]\n", + " # The **VARIANCE** of the random samples must be 1 / embedding_dimension\n", + " emb_mat = np.random.randn(vocabulary_size, emb_dim).astype(np.float32) / np.sqrt(emb_dim) # YOUR CODE HERE\n", + " emb_mat = tf.Variable(emb_mat, name='Embeddings', trainable=True)\n", + " emb = tf.nn.embedding_lookup(emb_mat, indices)\n", + " return emb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The body of the network is the convolutional layers. The basic idea behind convolutions is to apply the same dense layer to every n consecutive samples (tokens in our case). A simplified case is depicted below.\n", + "\n", + "\n", + "\n", + "Here number of input and output features equal to 1.\n", + "\n", + "Lets try it on a toy example:" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensor(\"conv1d_6/BiasAdd:0\", shape=(2, 3, 200), dtype=float32)\n" + ] + } + ], + "source": [ + "# Create a tensor with shape [batch_size, number_of_tokens, number_of_features]\n", + "x = tf.random_normal(shape=[2, 10, 100])\n", + "y = tf.layers.conv1d(x, filters=200, kernel_size=8)\n", + "print(y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see due to the abscence of zero padding (zeros on in the beginning and in the end of input) the size of resulting tensor along the token dimension is reduced. To use padding and preserve the dimensionality along the convolution dimension pass padding='same' parameter to the function." 
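+    ,
+    "\n",
+    "Without padding the output length along the token dimension is `num_tokens - kernel_size + 1 = 10 - 8 + 1 = 3`, which is exactly the `(2, 3, 200)` shape printed above; with `padding='same'` all 10 positions are preserved, as the next cell shows."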
+ ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensor(\"conv1d_7/BiasAdd:0\", shape=(2, 10, 200), dtype=float32)\n" + ] + } + ], + "source": [ + "y_with_padding = tf.layers.conv1d(x, filters=200, kernel_size=8, padding='same')\n", + "print(y_with_padding)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now stack a number of layers with dimensionality given in n_hidden_list" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "def conv_net(units, n_hidden_list, cnn_filter_width, activation=tf.nn.relu):\n", + " # Use activation(units) to apply activation to units\n", + " for n_hidden in n_hidden_list:\n", + " \n", + " units = tf.layers.conv1d(units,\n", + " n_hidden,\n", + " cnn_filter_width,\n", + " padding='same')\n", + " units = activation(units)\n", + " return units\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A common loss for the classification task is cross-entropy. Why classification? Because for each token the network must decide which tag to predict. The cross-entropy has the following form:\n", + "\n", + "$$ H(P, Q) = -E_{x \\sim P} log Q(x) $$\n", + "\n", + "It measures the dissimilarity between the ground truth distribution over the classes and predicted distribution. In the most of the cases ground truth distribution is one-hot. Luckily this loss is already [implemented](https://www.tensorflow.org/api_docs/python/tf/nn/softmax_cross_entropy_with_logits_v2) in TensorFlow." + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensor(\"softmax_cross_entropy_with_logits_3/Reshape_2:0\", shape=(1, 4), dtype=float32)\n" + ] + } + ], + "source": [ + "# The logits\n", + "l = tf.random_normal([1, 4, 3]) # shape [batch_size, number_of_tokens, number of classes]\n", + "indices = tf.placeholder(tf.int32, [1, 4])\n", + "\n", + "# Make one-hot distribution from indices for 3 types of tag\n", + "p = tf.one_hot(indices, depth=3)\n", + "loss_tensor = tf.nn.softmax_cross_entropy_with_logits_v2(labels=p, logits=l)\n", + "print(loss_tensor)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All sentences in the batch have same length and we pad the each sentence to the maximal lendth. So there are paddings at the end and pushing the network to predict those paddings usually results in deteriorated quallity. Then we need to multiply the loss tensor by binary mask to prevent gradient flow from the paddings." 
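+    ,
+    "\n",
+    "Note that averaging the masked tensor with a plain mean (as in the cells below) still divides by the total number of positions, paddings included; if you prefer the average over real tokens only, one optional variant is `loss = tf.reduce_sum(loss_tensor) / tf.reduce_sum(mask)`."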
+ ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "mask = tf.placeholder(tf.float32, shape=[1, 4])\n", + "loss_tensor *= mask" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The last step to do is to compute the mean value of the loss tensor:" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "loss = tf.reduce_mean(loss_tensor)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now define your own function that returns a scalar masked cross-entropy loss" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "def masked_cross_entropy(logits, label_indices, number_of_tags, mask):\n", + " ground_truth_labels = tf.one_hot(label_indices, depth=number_of_tags)\n", + " loss_tensor = tf.nn.softmax_cross_entropy_with_logits_v2(labels=ground_truth_labels, logits=logits)\n", + " loss_tensor *= mask\n", + " loss = tf.reduce_mean(loss_tensor)\n", + " return loss" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Put everything into a class:" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import tensorflow as tf\n", + "\n", + "class NerNetwork:\n", + " def __init__(self,\n", + " n_tokens,\n", + " n_tags,\n", + " token_emb_dim=100,\n", + " n_hidden_list=(128,),\n", + " cnn_filter_width=7,\n", + " use_batch_norm=False,\n", + " embeddings_dropout=False,\n", + " top_dropout=False,\n", + " **kwargs):\n", + " \n", + " # ================ Building inputs =================\n", + " \n", + " self.learning_rate_ph = tf.placeholder(tf.float32, [])\n", + " self.dropout_keep_ph = tf.placeholder(tf.float32, [])\n", + " self.token_ph = tf.placeholder(tf.int32, [None, None], name='token_ind_ph')\n", + " self.mask_ph = tf.placeholder(tf.float32, [None, None], name='Mask_ph')\n", + " self.y_ph = tf.placeholder(tf.int32, [None, None], name='y_ph')\n", + " \n", + " # ================== Building the network ==================\n", + " \n", + " # Now embedd the indices of tokens using token_emb_dim function\n", + " \n", + " ######################################\n", + " ########## YOUR CODE HERE ############\n", + " emb = get_embeddings(self.token_ph, n_tokens, token_emb_dim)\n", + " ######################################\n", + "\n", + " emb = tf.nn.dropout(emb, self.dropout_keep_ph, (tf.shape(emb)[0], 1, tf.shape(emb)[2]))\n", + " \n", + " # Build a multilayer CNN on top of the embeddings.\n", + " # The number of units in the each layer must match\n", + " # corresponding number from n_hidden_list.\n", + " # Use ReLU activation \n", + " ######################################\n", + " ########## YOUR CODE HERE ############\n", + " units = conv_net(emb, n_hidden_list, cnn_filter_width)\n", + " ######################################\n", + " units = tf.nn.dropout(units, self.dropout_keep_ph, (tf.shape(units)[0], 1, tf.shape(units)[2]))\n", + " logits = tf.layers.dense(units, n_tags, activation=None)\n", + " self.predictions = tf.argmax(logits, 2)\n", + " \n", + " # ================= Loss and train ops =================\n", + " # Use cross-entropy loss. 
check the tf.nn.softmax_cross_entropy_with_logits_v2 function\n", + " ######################################\n", + " ########## YOUR CODE HERE ############\n", + " self.loss = masked_cross_entropy(logits, self.y_ph, n_tags, self.mask_ph)\n", + " ######################################\n", + "\n", + " # Create a training operation to update the network parameters.\n", + " # We purpose to use the Adam optimizer as it work fine for the\n", + " # most of the cases. Check tf.train to find an implementation.\n", + " # Put the train operation to the attribute self.train_op\n", + " \n", + " ######################################\n", + " ########## YOUR CODE HERE ############\n", + " optimizer = tf.train.AdamOptimizer(self.learning_rate_ph)\n", + " self.train_op = optimizer.minimize(self.loss)\n", + " ######################################\n", + "\n", + " # ================= Initialize the session =================\n", + " \n", + " self.sess = tf.Session()\n", + " self.sess.run(tf.global_variables_initializer())\n", + "\n", + " def __call__(self, tok_batch, mask_batch):\n", + " feed_dict = {self.token_ph: tok_batch,\n", + " self.mask_ph: mask_batch,\n", + " self.dropout_keep_ph: 1.0}\n", + " return self.sess.run(self.predictions, feed_dict)\n", + "\n", + " def train_on_batch(self, tok_batch, tag_batch, mask_batch, dropout_keep_prob, learning_rate):\n", + " feed_dict = {self.token_ph: tok_batch,\n", + " self.y_ph: tag_batch,\n", + " self.mask_ph: mask_batch,\n", + " self.dropout_keep_ph: dropout_keep_prob,\n", + " self.learning_rate_ph: learning_rate}\n", + " self.sess.run(self.train_op, feed_dict)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now create an instance of the NerNetwork class:" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "nernet = NerNetwork(len(token_vocab),\n", + " len(tag_vocab),\n", + " n_hidden_list=[100, 100])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Regularly we want to check the score on validation part of the dataset every epoch. In the most of the cases of NER tasks the classes are imbalanced. And the accuray is not the best measure of performance. If we have 95% of 'O' tags, than the silly classifier, that always predicts '0' get 95% accuracy. To tackle this issue the F1-score is used. The F1-score can be defined as:\n", + "\n", + "$$ F1 = \\frac{2 P R}{P + R}$$ \n", + "\n", + "where P is precision and R is recall.\n", + "\n", + "Lets write the evaluation function. We need to get all predictions for the given part of the dataset and compute F1." 
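+    ,
+    "\n",
+    "As a sanity check for the formula: with $P = 0.8$ and $R = 0.6$ we get $F1 = \\frac{2 \\cdot 0.8 \\cdot 0.6}{0.8 + 0.6} \\approx 0.69$, which is lower than the arithmetic mean of $0.7$ because F1 penalises whichever of precision and recall is weaker."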
+ ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.models.ner.evaluation import precision_recall_f1\n", + "# The function precision_recall_f1 takes two lists: y_true and y_predicted\n", + "# the tag sequences for each sentences should be merged into one big list \n", + "from deeppavlov.core.data.utils import zero_pad\n", + "# zero_pad takes a batch of lists of token indices, pad it with zeros to the\n", + "# maximal length and convert it to numpy matrix\n", + "from itertools import chain\n", + "\n", + "\n", + "def eval_valid(network, batch_generator):\n", + " total_true = []\n", + " total_pred = []\n", + " for x, y_true in batch_generator:\n", + "\n", + " # Prepare token indices from tokens batch\n", + " x_inds = token_vocab(x) # YOUR CODE HERE\n", + "\n", + " # Pad the indices batch with zeros\n", + " x_batch = zero_pad(x_inds) # YOUR CODE HERE\n", + "\n", + " # Get the mask using get_mask\n", + " mask = get_mask(x) # YOUR CODE HERE\n", + " \n", + " # We call the instance of the NerNetwork because we have defined __call__ method\n", + " y_inds = network(x_batch, mask)\n", + "\n", + " # For every sentence in the batch extract all tags up to paddings\n", + " y_inds = [y_inds[n][:len(x[n])] for n, y in enumerate(y_inds)] # YOUR CODE HERE\n", + " y_pred = tag_vocab(y_inds)\n", + "\n", + " # Add fresh predictions \n", + " total_true.extend(chain(*y_true))\n", + " total_pred.extend(chain(*y_pred))\n", + " res = precision_recall_f1(total_true, total_pred, print_results=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set hyperparameters. You might want to start with the following recommended values:\n", + "- *batch_size*: 32;\n", + "- n_epochs: 10;\n", + "- starting value of *learning_rate*: 0.001\n", + "- *learning_rate_decay*: a square root of 2;\n", + "- *dropout_keep_probability* equal to 0.7 for training (typical values for dropout probability are ranging from 0.3 to 0.9).\n", + "\n", + "A very efficient technique for the learning rate managment is dropping learning rate after convergence. It is common to use dividers 2, 3, and 10 to drop the learning rate." 
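+    ,
+    "\n",
+    "The training loop below keeps the learning rate fixed. If you want to experiment with the decay, a minimal sketch of a hypothetical helper could look like this (it assumes you make `eval_valid` return the validation F1 instead of only printing it):\n",
+    "\n",
+    "```python\n",
+    "def update_learning_rate(learning_rate, current_f1, best_f1, decay=2 ** 0.5):\n",
+    "    # drop the learning rate once the validation score stops improving,\n",
+    "    # otherwise keep it and remember the new best score\n",
+    "    if current_f1 <= best_f1:\n",
+    "        return learning_rate / decay, best_f1\n",
+    "    return learning_rate, current_f1\n",
+    "```\n",
+    "\n",
+    "Calling it once per epoch, right after the evaluation, implements the drop-after-convergence strategy described above."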
+ ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 16 # YOUR HYPERPARAMETER HERE\n", + "n_epochs = 20 # YOUR HYPERPARAMETER HERE\n", + "learning_rate = 0.001 # YOUR HYPERPARAMETER HERE\n", + "dropout_keep_prob = 0.5 # YOUR HYPERPARAMETER HERE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we iterate through dataset batch by batch and pass the data to the train op" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:45:24.341 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5465 phrases; correct: 3397.\n", + "\n", + "precision: 62.16%; recall: 57.17%; FB1: 59.56\n", + "\n", + "\tLOC: precision: 64.94%; recall: 76.65%; F1: 70.31 2168\n", + "\n", + "\tMISC: precision: 51.44%; recall: 19.41%; F1: 28.19 348\n", + "\n", + "\tORG: precision: 50.16%; recall: 46.61%; F1: 48.32 1246\n", + "\n", + "\tPER: precision: 69.58%; recall: 64.33%; F1: 66.85 1703\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:45:27.357 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5615 phrases; correct: 4447.\n", + "\n", + "precision: 79.20%; recall: 74.84%; FB1: 76.96\n", + "\n", + "\tLOC: precision: 86.10%; recall: 84.00%; F1: 85.04 1792\n", + "\n", + "\tMISC: precision: 67.88%; recall: 68.76%; F1: 68.32 934\n", + "\n", + "\tORG: precision: 75.43%; recall: 61.82%; F1: 67.95 1099\n", + "\n", + "\tPER: precision: 80.50%; recall: 78.23%; F1: 79.35 1790\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:45:30.326 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5387 phrases; correct: 4584.\n", + "\n", + "precision: 85.09%; recall: 77.15%; FB1: 80.93\n", + "\n", + "\tLOC: precision: 89.46%; recall: 85.90%; F1: 87.64 1764\n", + "\n", + "\tMISC: precision: 85.34%; recall: 73.86%; F1: 79.19 798\n", + "\n", + "\tORG: precision: 80.15%; recall: 70.77%; F1: 75.17 1184\n", + "\n", + "\tPER: precision: 83.85%; recall: 74.70%; F1: 79.01 1641\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:45:33.277 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5436 phrases; correct: 4702.\n", + "\n", + "precision: 86.50%; recall: 79.13%; FB1: 82.65\n", + "\n", + "\tLOC: precision: 89.44%; recall: 86.72%; F1: 88.06 1781\n", + "\n", + "\tMISC: precision: 88.02%; recall: 75.70%; F1: 81.40 793\n", + "\n", + "\tORG: precision: 83.25%; recall: 71.14%; F1: 76.72 1146\n", + "\n", + "\tPER: precision: 84.91%; recall: 79.10%; F1: 81.90 1716\n", + "\n", + "\n" + ] + }, + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:45:36.266 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5298 phrases; correct: 4688.\n", + "\n", + "precision: 88.49%; recall: 78.90%; FB1: 83.42\n", + "\n", + "\tLOC: precision: 92.71%; recall: 85.90%; F1: 89.18 1702\n", + "\n", + "\tMISC: precision: 89.10%; recall: 77.98%; F1: 83.17 807\n", + "\n", + "\tORG: precision: 82.95%; recall: 75.09%; F1: 78.83 1214\n", + "\n", + "\tPER: precision: 87.87%; recall: 75.14%; F1: 81.01 1575\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:45:39.228 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5339 phrases; correct: 4705.\n", + "\n", + "precision: 88.13%; recall: 79.18%; FB1: 83.41\n", + "\n", + "\tLOC: precision: 90.68%; recall: 86.88%; F1: 88.74 1760\n", + "\n", + "\tMISC: precision: 86.87%; recall: 78.20%; F1: 82.31 830\n", + "\n", + "\tORG: precision: 83.80%; recall: 74.42%; F1: 78.83 1191\n", + "\n", + "\tPER: precision: 89.22%; recall: 75.46%; F1: 81.76 1558\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:45:42.213 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5412 phrases; correct: 4789.\n", + "\n", + "precision: 88.49%; recall: 80.60%; FB1: 84.36\n", + "\n", + "\tLOC: precision: 93.45%; recall: 86.94%; F1: 90.07 1709\n", + "\n", + "\tMISC: precision: 89.57%; recall: 79.18%; F1: 84.05 815\n", + "\n", + "\tORG: precision: 81.04%; recall: 76.81%; F1: 78.87 1271\n", + "\n", + "\tPER: precision: 88.56%; recall: 77.74%; F1: 82.80 1617\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:45:45.169 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5388 phrases; correct: 4763.\n", + "\n", + "precision: 88.40%; recall: 80.16%; FB1: 84.08\n", + "\n", + "\tLOC: precision: 91.84%; recall: 88.19%; F1: 89.98 1764\n", + "\n", + "\tMISC: precision: 87.17%; recall: 78.85%; F1: 82.80 834\n", + "\n", + "\tORG: precision: 82.20%; recall: 75.09%; F1: 78.49 1225\n", + "\n", + "\tPER: precision: 90.03%; recall: 76.49%; F1: 82.71 1565\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:45:48.170 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5345 phrases; correct: 4722.\n", + "\n", + "precision: 88.34%; recall: 79.47%; FB1: 83.67\n", + "\n", + "\tLOC: precision: 92.03%; recall: 87.43%; F1: 89.67 1745\n", + "\n", + "\tMISC: precision: 88.51%; recall: 79.39%; F1: 83.70 827\n", + "\n", 
+ "\tORG: precision: 80.78%; recall: 75.84%; F1: 78.23 1259\n", + "\n", + "\tPER: precision: 90.29%; recall: 74.21%; F1: 81.47 1514\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:45:51.116 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5273 phrases; correct: 4716.\n", + "\n", + "precision: 89.44%; recall: 79.37%; FB1: 84.10\n", + "\n", + "\tLOC: precision: 92.76%; recall: 87.21%; F1: 89.90 1727\n", + "\n", + "\tMISC: precision: 90.83%; recall: 79.50%; F1: 84.79 807\n", + "\n", + "\tORG: precision: 82.51%; recall: 75.99%; F1: 79.11 1235\n", + "\n", + "\tPER: precision: 90.56%; recall: 73.94%; F1: 81.41 1504\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:45:54.39 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5183 phrases; correct: 4632.\n", + "\n", + "precision: 89.37%; recall: 77.95%; FB1: 83.27\n", + "\n", + "\tLOC: precision: 93.43%; recall: 85.90%; F1: 89.51 1689\n", + "\n", + "\tMISC: precision: 90.86%; recall: 79.83%; F1: 84.99 810\n", + "\n", + "\tORG: precision: 83.84%; recall: 75.47%; F1: 79.43 1207\n", + "\n", + "\tPER: precision: 88.42%; recall: 70.90%; F1: 78.70 1477\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:45:56.988 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5162 phrases; correct: 4587.\n", + "\n", + "precision: 88.86%; recall: 77.20%; FB1: 82.62\n", + "\n", + "\tLOC: precision: 92.82%; recall: 85.90%; F1: 89.23 1700\n", + "\n", + "\tMISC: precision: 90.65%; recall: 79.93%; F1: 84.96 813\n", + "\n", + "\tORG: precision: 82.99%; recall: 74.94%; F1: 78.76 1211\n", + "\n", + "\tPER: precision: 88.11%; recall: 68.78%; F1: 77.26 1438\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:45:59.925 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5220 phrases; correct: 4630.\n", + "\n", + "precision: 88.70%; recall: 77.92%; FB1: 82.96\n", + "\n", + "\tLOC: precision: 93.66%; recall: 86.12%; F1: 89.73 1689\n", + "\n", + "\tMISC: precision: 90.63%; recall: 79.72%; F1: 84.82 811\n", + "\n", + "\tORG: precision: 82.79%; recall: 76.06%; F1: 79.28 1232\n", + "\n", + "\tPER: precision: 86.90%; recall: 70.20%; F1: 77.66 1488\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:46:02.877 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5220 phrases; correct: 4650.\n", + "\n", + "precision: 89.08%; recall: 78.26%; 
FB1: 83.32\n", + "\n", + "\tLOC: precision: 93.30%; recall: 86.45%; F1: 89.74 1702\n", + "\n", + "\tMISC: precision: 90.98%; recall: 79.83%; F1: 85.04 809\n", + "\n", + "\tORG: precision: 82.74%; recall: 75.09%; F1: 78.73 1217\n", + "\n", + "\tPER: precision: 88.40%; recall: 71.61%; F1: 79.12 1492\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:46:05.850 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5220 phrases; correct: 4678.\n", + "\n", + "precision: 89.62%; recall: 78.73%; FB1: 83.82\n", + "\n", + "\tLOC: precision: 92.90%; recall: 86.94%; F1: 89.82 1719\n", + "\n", + "\tMISC: precision: 90.42%; recall: 79.83%; F1: 84.79 814\n", + "\n", + "\tORG: precision: 83.96%; recall: 74.94%; F1: 79.20 1197\n", + "\n", + "\tPER: precision: 89.93%; recall: 72.75%; F1: 80.43 1490\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:46:08.832 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5163 phrases; correct: 4599.\n", + "\n", + "precision: 89.08%; recall: 77.40%; FB1: 82.83\n", + "\n", + "\tLOC: precision: 91.82%; recall: 86.77%; F1: 89.22 1736\n", + "\n", + "\tMISC: precision: 91.66%; recall: 79.83%; F1: 85.33 803\n", + "\n", + "\tORG: precision: 85.16%; recall: 73.60%; F1: 78.96 1159\n", + "\n", + "\tPER: precision: 87.51%; recall: 69.60%; F1: 77.53 1465\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:46:11.711 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5200 phrases; correct: 4672.\n", + "\n", + "precision: 89.85%; recall: 78.63%; FB1: 83.86\n", + "\n", + "\tLOC: precision: 93.62%; recall: 86.23%; F1: 89.77 1692\n", + "\n", + "\tMISC: precision: 89.32%; recall: 79.83%; F1: 84.31 824\n", + "\n", + "\tORG: precision: 83.79%; recall: 75.17%; F1: 79.25 1203\n", + "\n", + "\tPER: precision: 90.75%; recall: 72.96%; F1: 80.89 1481\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:46:14.542 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5196 phrases; correct: 4645.\n", + "\n", + "precision: 89.40%; recall: 78.17%; FB1: 83.41\n", + "\n", + "\tLOC: precision: 93.58%; recall: 85.68%; F1: 89.46 1682\n", + "\n", + "\tMISC: precision: 88.96%; recall: 79.50%; F1: 83.96 824\n", + "\n", + "\tORG: precision: 83.78%; recall: 74.72%; F1: 78.99 1196\n", + "\n", + "\tPER: precision: 89.42%; recall: 72.53%; F1: 80.10 1494\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:46:17.420 DEBUG in 
'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5140 phrases; correct: 4553.\n", + "\n", + "precision: 88.58%; recall: 76.62%; FB1: 82.17\n", + "\n", + "\tLOC: precision: 93.59%; recall: 85.85%; F1: 89.55 1685\n", + "\n", + "\tMISC: precision: 90.83%; recall: 79.50%; F1: 84.79 807\n", + "\n", + "\tORG: precision: 85.03%; recall: 74.57%; F1: 79.46 1176\n", + "\n", + "\tPER: precision: 84.44%; recall: 67.48%; F1: 75.02 1472\n", + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluating the model on valid part of the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:46:20.380 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 51362 tokens with 5942 phrases; found: 5196 phrases; correct: 4666.\n", + "\n", + "precision: 89.80%; recall: 78.53%; FB1: 83.79\n", + "\n", + "\tLOC: precision: 91.30%; recall: 88.02%; F1: 89.63 1771\n", + "\n", + "\tMISC: precision: 92.10%; recall: 79.61%; F1: 85.40 797\n", + "\n", + "\tORG: precision: 85.65%; recall: 74.79%; F1: 79.86 1171\n", + "\n", + "\tPER: precision: 90.05%; recall: 71.23%; F1: 79.54 1457\n", + "\n", + "\n" + ] + } + ], + "source": [ + "for epoch in range(n_epochs):\n", + " for x, y in data_iterator.gen_batches(batch_size, 'train'):\n", + " # Convert tokens to indices via Vocab\n", + " x_inds = token_vocab(x) # YOUR CODE \n", + " # Convert tags to indices via Vocab\n", + " y_inds = tag_vocab(y) # YOUR CODE \n", + " \n", + " # Pad every sample with zeros to the maximal length\n", + " x_batch = zero_pad(x_inds)\n", + " y_batch = zero_pad(y_inds)\n", + "\n", + " mask = get_mask(x)\n", + " nernet.train_on_batch(x_batch, y_batch, mask, dropout_keep_prob, learning_rate)\n", + " print('Evaluating the model on valid part of the dataset')\n", + " eval_valid(nernet, data_iterator.gen_batches(batch_size, 'valid'))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Eval the model on test part now" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-27 13:46:35.397 DEBUG in 'deeppavlov.models.ner.evaluation'['evaluation'] at line 213: processed 46435 tokens with 5648 phrases; found: 4561 phrases; correct: 3738.\n", + "\n", + "precision: 81.96%; recall: 66.18%; FB1: 73.23\n", + "\n", + "\tLOC: precision: 84.02%; recall: 82.25%; F1: 83.13 1633\n", + "\n", + "\tMISC: precision: 81.80%; recall: 71.08%; F1: 76.07 610\n", + "\n", + "\tORG: precision: 81.25%; recall: 60.26%; F1: 69.20 1232\n", + "\n", + "\tPER: precision: 79.74%; recall: 53.56%; F1: 64.08 1086\n", + "\n", + "\n" + ] + } + ], + "source": [ + "eval_valid(nernet, data_iterator.gen_batches(batch_size, 'test'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lets try to infer the model on our sentence:" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Petr', 'stole', 'my', 'vodka']\n", + "['B-PER', 'O', 'O', 'O']\n" + ] + } + ], + "source": [ + "sentence = 'Petr stole my vodka'\n", + "x = [sentence.split()]\n", + "\n", + "x_inds = token_vocab(x)\n", + "x_batch = zero_pad(x_inds)\n", + "mask = get_mask(x)\n", + "y_inds = nernet(x_batch, mask)\n", + "print(x[0])\n", + "print(tag_vocab(y_inds)[0])" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/tutorials/02_deeppavlov_ner.pdf b/examples/tutorials/02_deeppavlov_ner.pdf new file mode 100644 index 0000000000..1352158e92 Binary files /dev/null and b/examples/tutorials/02_deeppavlov_ner.pdf differ diff --git a/examples/tutorials/03_deeppavlov_gobot.ipynb b/examples/tutorials/03_deeppavlov_gobot.ipynb new file mode 100644 index 0000000000..cc4d29278a --- /dev/null +++ b/examples/tutorials/03_deeppavlov_gobot.ipynb @@ -0,0 +1,2162 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import copy" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# pretty prints\n", + "from pprint import pprint" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# creating directory for json configs\n", + "import os\n", + "\n", + "if not os.path.isdir(\"gobot\"):\n", + " os.mkdir(\"gobot\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import deeppavlov" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**NOTE**: \"go_bot\" model trains faster on a CPU, so let's ignore existing GPUs:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cuda visible devices = ''\r\n" + ] + } + ], + "source": [ + "!export CUDA_VISIBLE_DEVICES=\"\"\n", + "!echo \"cuda visible devices = '\"$CUDA_VISIBLE_DEVICES\"'\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hybrid goal-oriented bot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dialog bots are categorized into two types:\n", + "\n", + "1. **goal-oriented models **\n", + "\n", + " (those who have to achieve some kind of a goal in the end of conversation: \n", + " - restaurant and flight booking,\n", + " - customer support service,\n", + " - etc.);\n", + " \n", + "2. **chit-chat models **\n", + "\n", + " (those who chat just for fun, the longer bot speaks with you the better, example:\n", + " - \"replica\" mobile application).\n", + "\n", + "We will only dive into goal-oriented task specification." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![go bot architecture 00](img/bot_architecture00.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A classical dialog system consists of:\n", + "\n", + "1. **Natural Language Understanding component (NLU)**\n", + "\n", + " that is intended to \"understand\" human and represent it's \"understanding\" in a machine readable format. 
\n", + "\n", + " It takes an utterance text as input and converts to a dialog \"frame\".\n", + "\n", + " \"Frame\" may consist of:\n", + " - domain value (domain is some kind of \"a type of dialogs\");\n", + " - intent value (intent of current human utterance: \"welcome_message\", \"asking_weather\", etc.);\n", + " - entity slots (entities are mentioned by human \"location\", \"time\", etc.).\n", + "\n", + "2. **Dialogue Manager component (DM)**\n", + "\n", + " that is intended to decide what to respond. \n", + " \n", + " It takes a filled by NLU frame and outputs action (it isn't a final text, it is a label). \n", + " \n", + " For example, there may be actions: \"say_welcome\", \"say_goodbye\", \"ask_location\", \"give_weather\", etc.\n", + "\n", + "3. **Natural Language Generation component(NLG)**\n", + "\n", + " that is intended to convert action to an actual text response representation.\n", + " \n", + " For example, \"say_goodbye\" -> \"You are welcome!\"." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## NLU " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![go bot architecture 01](img/bot_architecture01.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's consider a dialog system with NLU component that consists of a single Named Entity Recognition \n", + " component (or NER)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One of the previous tutorials introduced deeppavlov NER model and showed how to use it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DM & NLG" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The tutorial is focused on how to implement\n", + " - Dialogue Manager and\n", + " - Natural Language Generator." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will train the chatbot on a [Dialog State Tracking Chellenge 2](http://camdial.org/~mh521/dstc/) data.\n", + "\n", + "Let's download it first." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-07-09 13:43:12.972 INFO in 'deeppavlov.dataset_readers.dstc2_reader'['dstc2_reader'] at line 197: [downloading data from http://lnsigo.mipt.ru/export/datasets/dstc2_v2.tar.gz to tmp/my_download_of_dstc2]\n", + "2018-07-09 13:43:12.976 INFO in 'requests.packages.urllib3.connectionpool'['connectionpool'] at line 203: Starting new HTTP connection (1): lnsigo.mipt.ru\n", + "2018-07-09 13:43:13.554 DEBUG in 'requests.packages.urllib3.connectionpool'['connectionpool'] at line 383: \"GET /export/datasets/dstc2_v2.tar.gz HTTP/1.1\" 200 506300\n", + "2018-07-09 13:43:13.557 INFO in 'deeppavlov.core.data.utils'['utils'] at line 65: Downloading from http://lnsigo.mipt.ru/export/datasets/dstc2_v2.tar.gz to /home/vimary/ipavlov/Pilot/examples/tutorials/tmp/my_download_of_dstc2/dstc2_v2.tar.gz\n", + "100%|██████████| 506k/506k [00:00<00:00, 1.62MB/s]\n", + "2018-07-09 13:43:13.875 INFO in 'deeppavlov.core.data.utils'['utils'] at line 149: Extracting tmp/my_download_of_dstc2/dstc2_v2.tar.gz archive into tmp/my_download_of_dstc2\n", + "2018-07-09 13:43:13.925 INFO in 'deeppavlov.dataset_readers.dstc2_reader'['dstc2_reader'] at line 214: [loading dialogs from tmp/my_download_of_dstc2/dstc2-trn.jsonlist]\n", + "2018-07-09 13:43:14.54 INFO in 'deeppavlov.dataset_readers.dstc2_reader'['dstc2_reader'] at line 214: [loading dialogs from tmp/my_download_of_dstc2/dstc2-val.jsonlist]\n", + "2018-07-09 13:43:14.154 INFO in 'deeppavlov.dataset_readers.dstc2_reader'['dstc2_reader'] at line 214: [loading dialogs from tmp/my_download_of_dstc2/dstc2-tst.jsonlist]\n" + ] + } + ], + "source": [ + "from deeppavlov.dataset_readers.dstc2_reader import DSTC2Version2DatasetReader\n", + "\n", + "data = DSTC2Version2DatasetReader().read(data_path=\"tmp/my_download_of_dstc2\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`DSTC2Version2DatasetReader` downloaded the needed data and saved to disk.\n", + "\n", + "`DialogDatasetIterator` took the data as input and transformed it to batches." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.dataset_iterators.dialog_iterator import DialogDatasetIterator\n", + "\n", + "batches_generator = DialogDatasetIterator(data, seed=1443, shuffle=True)\\\n", + " .gen_batches(batch_size=4, data_type='train')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "-------------\n", + "-------------\n", + " Let's take a closer look at a batch content:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "batch = batches_generator.__next__()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each batch is a tuple of two elements:\n", + " - list of x's and\n", + " - list of y's" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "x_batch, y_batch = batch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`x_batch` (and `y_batch`) consists of 4 samples. This is because `batch_size` was 4." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(x_batch)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One sample is a dialog. This is how one turn from a dialog looks like:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----0th turn----\n", + "( {'intents': [], 'prev_resp_act': None, 'text': ''},\n", + " { 'act': 'welcomemsg',\n", + " 'text': 'Hello, welcome to the Cambridge restaurant system. '\n", + " 'You can ask for restaurants by area, price range or '\n", + " 'food type. How may I help you?'})\n" + ] + } + ], + "source": [ + "dialog_id = 0\n", + "dialog = [(x, y) for x, y in zip(x_batch[dialog_id], y_batch[dialog_id])]\n", + "\n", + "turn_id = 0\n", + "print(\"----{}th turn----\".format(turn_id)) \n", + "pprint(dialog[turn_id], indent=8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is how the whole dialog looks (printing `'text'` parts of `x_batch` and `y_batch`):" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":: \n", + ">> Hello, welcome to the Cambridge restaurant system. You can ask for restaurants by area, price range or food type. How may I help you? \n", + "\n", + ":: traditional\n", + ">> api_call area=\"dontcare\" food=\"traditional\" pricerange=\"dontcare\" \n", + "\n", + ":: traditional\n", + ">> I am sorry but there is no traditional restaurant that matches your request. \n", + "\n", + ":: italian food\n", + ">> What part of town do you have in mind? \n", + "\n", + ":: south\n", + ">> api_call area=\"south\" food=\"italian\" pricerange=\"dontcare\" \n", + "\n", + ":: south\n", + ">> Pizza hut cherry hinton is a nice place in the south of town serving tasty italian food. \n", + "\n", + ":: phone number\n", + ">> The phone number of pizza hut cherry hinton is 01223 323737. \n", + "\n", + ":: post code\n", + ">> The post code of pizza hut cherry hinton is C.B 1, 7 D.Y. \n", + "\n", + ":: thank you good bye\n", + ">> You are welcome! 
\n", + "\n" + ] + } + ], + "source": [ + "for turn in dialog:\n", + " x, y = turn\n", + " print('::', x['text']) \n", + " print('>>', y['text'], '\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### HowTo: DeepPavlov configs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DeepPavlov uses json configs as a tool to configure data pipelines.\n", + "\n", + "Config has the following sections:\n", + " \n", + " - **dataset_reader**\n", + " - configuration of dataset reader component (is responsible for data download and saving to disk);\n", + " \n", + " - **dataset_iterator**\n", + " - configuration of dataset iterator component (is responsible for making batches (sequences) of data that will be further fed to pipe components);\n", + " \n", + " - **metadata**\n", + " - extra info (urls for data download and telegram configuration);\n", + "\n", + " - **train**\n", + " - training process configuration (size of batches, number of training epochs, etc.);\n", + " \n", + " - **chainer**\n", + " - specifies data flow (which components are run and in what order);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's construct a simple config that builds a dictionary of input sample tokens." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "vocab_config = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- **dataset_reader** -- configuration of dataset reader component (that is responsible for data download and saving to disk)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "dstc2_reader_comp_config = {\n", + " 'name': 'dstc2_v2_reader',\n", + " 'data_path': 'dstc2_v2'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "vocab_config['dataset_reader'] = dstc2_reader_comp_config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- **dataset_iterator** -- configuration of dataset iterator component (that is responsible for making batches (sequences) of data that will be further fed to pipe components)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "dialog_iterator_comp_config = {\n", + " 'name': 'dialog_iterator'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "vocab_config['dataset_iterator'] = dialog_iterator_comp_config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- **metadata** -- some extra info\n", + " - **metadata.download** -- a list of data which should be downloaded in order for config to work" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "dstc2_download_config = {\n", + " 'url': 'http://lnsigo.mipt.ru/export/datasets/dstc2_v2.tar.gz',\n", + " 'subdir': 'dstc2_v2'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "vocab_config['metadata'] = {}\n", + "vocab_config['metadata']['download'] = [\n", + " dstc2_download_config\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- **train** -- training process configuration\n", + " \n", + "We don't need to train anything now, just build (fit on whole dataset once) a dictionary, so 
\"train\" section is empty." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "vocab_config['train'] = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " - **chainer** specifies data flow:\n", + " \n", + " - **chainer.in** -- is a list of input sample names (one data sample might consist of several variables);\n", + " - **chainer.in_y** -- is a list of input label names (each sample might have labels of different kind);\n", + " - **chainer.out** -- is a list of output prediction names (usually has the same length as \"chainer.in_y\");\n", + " \n", + "X is only an utterance here.\n", + "\n", + "Y is empty (we don't need to train the dictionary like neural networks)\n", + "\n", + "There is no prediction for the config, nothing to predict." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "vocab_config['chainer'] = {}\n", + "vocab_config['chainer']['in'] = ['utterance']\n", + "vocab_config['chainer']['in_y'] = []\n", + "vocab_config['chainer']['out'] = []" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- **chainer**\n", + " - **chainer.pipe** -- is a list of consequently run components. This is the place where you specify in which order and what kind of data will be fed to components. \n", + " \n", + "Our pipe consists of one component -- \"default_vocab\"." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### HowTo: Component config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Component configs are always just a part of a global model config (described above).\n", + "\n", + "Config for any component contains the following **_required_** parameters:\n", + " - **name** -- registered name of a component (it is a link to python component implementation)\n", + " - **save_path** -- path to save the component (sometimes is not needed, for example, for tokenizers)\n", + " - **load_path** -- path to load the component (sometimes is not needed, for examples, for tokenizers)\n", + "\n", + "and the following **_optional_** parameters:\n", + " - **id** -- reference name for the component\n", + " - **ref** -- \"id\" of a component that was previously initialized. 
It can be used instead of \"name\".\n", + " - **fit\\_on** -- a list of data fields to fit on (it calls \\_\\_fit\\_\\_ method of the component)\n", + " - **in** -- a list of data fields that are inputs during inference (prediction)\n", + " - **out** -- a list of data fields that are outputs during inference (prediction)\n", + " \n", + " \n", + "\"default_vocab\" component also has it's on unique parameters:\n", + " - level -- on which level to operate ('token' level and 'char' (character) level are available)\n", + " - tokenizer -- if input is a string, then it will be tokenized by the tokenizer, _optional parameter_" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "vocab_comp_config = {\n", + " 'name': 'default_vocab',\n", + " 'save_path': 'vocabs/token.dict',\n", + " 'load_path': 'vocabs/token.dict',\n", + " 'fit_on': ['utterance'],\n", + " 'level': 'token',\n", + " 'tokenizer': {'name': 'split_tokenizer'},\n", + " 'main': True\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "vocab_config['chainer']['pipe'] = [\n", + " vocab_comp_config\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "json.dump(vocab_config, open(\"gobot/vocab_config.json\", 'wt'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To download \"dstc2_v2\" dataset use `deeppavlov.deep_download` script (you have to do it only once):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.download import deep_download # it is called \"deep\" in honor of \"Deep Pavlov\"\n", + "\n", + "deep_download(['--config', 'gobot/vocab_config.json'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All data and models are saved to root of deeppavlov module + `../download` (`DEEPPAVLOV_ROOT/../download/`)." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "dstc2_v2_path = deeppavlov.__path__[0] + '/../download/dstc2_v2'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data was downloaded to `dstc2_v2_path`:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> ls /home/vimary/ipavlov/Pilot/deeppavlov/../download/dstc2_v2\n", + "dstc2-templates.txt dstc2-tst.jsonlist resto.sqlite\n", + "dstc2-trn.jsonlist dstc2-val.jsonlist\n" + ] + } + ], + "source": [ + "# The command will only work for linux, do not panic otherwise -- it isn't something crucially important.\n", + "# You can further just comment bash commands.\n", + "!echo \"> ls $dstc2_v2_path\"\n", + "!ls $dstc2_v2_path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's build our vocabulary." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.core.commands.train import train_evaluate_model_from_config\n", + "\n", + "train_evaluate_model_from_config(\"gobot/vocab_config.json\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Vocabulary was built on data and saved to disk.\n", + "\n", + "`save_path = 'vocabs/token.dict'` and component files are saved to `DEEPPAVLOV_ROOT/../download/vocabs/token.dict`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "vocabs_path = deeppavlov.__path__[0] + '/../download/vocabs'" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> ls /home/vimary/ipavlov/Pilot/deeppavlov/../download/vocabs\n", + "token.dict\n" + ] + } + ], + "source": [ + "!echo \"> ls $vocabs_path\"\n", + "!ls $vocabs_path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is the content of the saved \"token.dict\":" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> head /home/vimary/ipavlov/Pilot/deeppavlov/../download/vocabs/token.dict\n", + "cheap\t391\n", + "restaurant\t1179\n", + "any\t259\n", + "south\t306\n", + "address\t781\n", + "phone\t819\n", + "number\t825\n", + "thank\t998\n", + "you\t1055\n", + "good\t891\n" + ] + } + ], + "source": [ + "!echo \"> head $vocabs_path/token.dict\"\n", + "!head $vocabs_path/token.dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Using trained component" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use built vocabulary by initializing it with `build_model_from_config`.\n", + "\n", + "We need to add `in` and `out` to component configuration ( to know what are inputs and outputs during prediction ) :\n", + " - **in** -- a list of data fields that are inputs during inference (prediction)\n", + " - **out** -- a list of data fields that are outputs during inference (prediction)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "vocab_comp_config['in'] = ['utterance']\n", + "vocab_comp_config['out'] = ['utterance_token_indices']\n", + "\n", + "vocab_config['chainer']['pipe'] = [\n", + " vocab_comp_config\n", + "]\n", + "vocab_config['chainer']['out'] = ['utterance_token_indices']" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-29 14:39:57.111 INFO in 'deeppavlov.core.data.vocab'['vocab'] at line 162: [loading vocabulary from /home/vimary/ipavlov/Pilot/download/vocabs/token.dict]\n" + ] + } + ], + "source": [ + "from deeppavlov.core.commands.infer import build_model_from_config\n", + "\n", + "model = build_model_from_config(vocab_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Model expects a list of samples (batch) as input." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[141]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model(['hi'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model `gobot_dstc2_simple`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's train a simple goal-oriented bot:" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.download import deep_download\n", + "from deeppavlov.core.commands.train import train_evaluate_model_from_config\n", + "from deeppavlov.core.commands.infer import build_model_from_config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"dataset_reader\", \"dataset_iterator\" and \"metadata\" will be the same as for vocabulary only." + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "simple_config = {}\n", + "\n", + "simple_config['dataset_reader'] = dstc2_reader_comp_config\n", + "simple_config['dataset_iterator'] = dialog_iterator_comp_config\n", + "simple_config['metadata'] = {}\n", + "simple_config['metadata']['download'] = [\n", + " dstc2_download_config\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "X here is a dict `'x'` containing context 'text', 'intents', db_result', 'prev_resp_act'\n", + "\n", + "Y here is a dict `'y'` containing response 'act' and 'text'\n", + "\n", + "Prediction `'y_predicted'` here will be only 'text'" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "simple_config['chainer'] = {}\n", + "simple_config['chainer']['in'] = ['x']\n", + "simple_config['chainer']['in_y'] =['y']\n", + "simple_config['chainer']['out'] = ['y_predicted']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The bot consists (`pipe` section) of two components:\n", + "- **`default_vocab`** (or DefaultVocabulary) component that \n", + "\n", + " - remembers all tokens from user utterances. \n", + " - `DefaultVocabulary.__call__` method inputs batch of tokens and outputs their indeces." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Vocabulary component will be the same as before, but let's add reference to component using `id` \n", + "- **id** -- reference name for the component" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "vocab_comp_config = {\n", + " 'name': 'default_vocab',\n", + " 'id': 'token_vocab',\n", + " 'load_path': 'vocabs/token.dict',\n", + " 'save_path': 'vocabs/token.dict',\n", + " 'fit_on': ['x'],\n", + " 'level': 'token',\n", + " 'tokenizer': {'name': 'split_tokenizer'}\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Adding vocabulary to chainer:" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "simple_config['chainer']['pipe'] = []\n", + "simple_config['chainer']['pipe'].append(vocab_comp_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- **`go_bot`** (or GoalOrientedBot) component that\n", + " - calls `slot_filler` that for user utterance outputs mentioned slots \n", + " (for example, \"i want cheap food\" -> {'pricerange': 'cheap'})\n", + " - updates dialog state with `tracker` (DialogStateTracker)\n", + " \n", + " (for example, if old state was {'location': 'north'}, \n", + " and current slots are {'pricerange': 'cheap'}, \n", + " then new dialog state will be {'location': 'north', 'pricerange': 'cheap'})\n", + " - converts user utterance in string format (`x`) to tokens with `tokenizer`\n", + "\n", + " (for example, \"hi, i want some cheap food\" -> ['hi', ',', 'i', 'want', 'some', 'cheap', 'food'])\n", + " - then embeds the tokens with bag-of-words using `bow_embedder`(if not None) and `word_vocab`\n", + "\n", + " (for example, \"cheap\" -> [1, 0, 0, 0, .., 0])\n", + " - embeds the utterance with continuous `embedder` (if not None) as a mean of embeddings of utterance tokens\n", + " \n", + " (for example, \"i\" -> [0.1231, 0.23423, .., 0.03489])\n", + " - concatenates embeddings and passes it as an input to a recurrent neural network (RNN)\n", + " - trains RNN (with LongShortTermMemory (LSTM) as a core graph) that outputs an action label\n", + " - loads templates (mapping from labels to string) using `template_path` and `template_type` and converts action label to string\n", + " \n", + " (for example, \"bye_msg\" -> \"You are welcome!\")\n", + " - fills result string with slot values from dialog state\n", + " \n", + " (for example, if\n", + " dialog state is equal to {'pricerange': 'cheap'}\n", + " and output string is \"There are no restaurants in a #pricerange pricerange\"\n", + " then the result response will be \"There are no restaurants in a cheap pricerange\")" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "bot_comp_config = {\n", + " 'name': 'go_bot',\n", + " 'in': ['x'],\n", + " 'in_y': ['y'],\n", + " 'out': ['y_predicted'],\n", + " 'word_vocab': None,\n", + " 'bow_embedder': {\"name\": \"bow\"},\n", + " 'embedder': None,\n", + " 'slot_filler': None,\n", + " 'template_path': 'dstc2_v2/dstc2-templates.txt',\n", + " 'template_type': 'DualTemplate',\n", + " 'database': None,\n", + " 'api_call_action': 'api_call',\n", + " 'network_parameters': {\n", + " 'load_path': 'gobot_dstc2_simple/model',\n", + " 'save_path': 'gobot_dstc2_simple/model',\n", + " 'dense_size': 64,\n", + " 'hidden_size': 128,\n", + " 'learning_rate': 0.002,\n", + " 
'attention_mechanism': None\n", + " },\n", + " 'tokenizer': {'name': 'stream_spacy_tokenizer',\n", + " 'lowercase': False},\n", + " 'tracker': {'name': 'featurized_tracker',\n", + " 'slot_names': ['pricerange', 'this', 'area', 'food', 'name']},\n", + " 'main': True,\n", + " 'debug': False\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is how we use vocabulary by reference:" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "bot_comp_config['word_vocab'] = '#token_vocab'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Announcing slot filler component.\n", + "We assume that slot filler is already trained, and use it by referencing it's config." + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "slot_filler_comp_config = {\n", + " 'config_path': deeppavlov.__path__[0] + '/../deeppavlov/configs/ner/slotfill_dstc2.json'\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Adding slot filler to bot component:" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "bot_comp_config['slot_filler'] = slot_filler_comp_config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Adding `bot_comp_config` to `pipe`:" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "simple_config['chainer']['pipe'].append(bot_comp_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Neural network (in the bot) is trained in epochs, and needs data in the form of batches.\n", + "\n", + "That is why we are now filling \"train\" section with training parameters." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- **train** -- training process configuration\n", + " - **train.batch_size** is a number of samples in a batch (feeded to the network during one training step)\n", + " - **train.epochs** is a number of iterations over dataset during training\n", + " - **train.log_every_n_batches** and **train.log_every_n_epochs** control frequency of logging messages\n", + " - **train.metrics** is a list of metrics used to validate our performance\n", + " - **train.val_every_n_batches** and **train.val_every_n_epochs** describes how often we calculate metrics on `valid` data split\n", + " - **train.validation_patience** is a number of epochs without metric improvement on `valid` data that we are able to endure =)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "simple_bot_train_config = {\n", + " 'batch_size': 4,\n", + " 'epochs': 2,\n", + " 'log_every_n_batches': -1,\n", + " 'log_every_n_epochs': 1,\n", + " 'metrics': ['per_item_dialog_accuracy'],\n", + " 'val_every_n_epochs': 1,\n", + " 'validation_patience': 20\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "simple_config['train'] = simple_bot_train_config" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "json.dump(simple_config, open(\"gobot/simple_config.json\", 'wt'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`train.epochs` is set to '2' for now, if you intend to train a smarter model, you should increase it (a range from 10 to 200 epochs is recommended)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "deep_download(['--config', slot_filler_comp_config['config_path']])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_evaluate_model_from_config(\"gobot/simple_config.json\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's comminicate with the resulting bot. \"exit\" message initiates end of dialogue." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = build_model_from_config(simple_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-29 14:42:17.99 WARNING in 'deeppavlov.models.go_bot.bot'['bot'] at line 343: No database specified.\n", + "2018-06-29 14:42:17.99 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 344: Made api_call with {'pricerange': 'cheap', 'food': 'italian', 'area': 'north'}, got 0 results.\n" + ] + }, + { + "data": { + "text/plain": [ + "['Sorry there is no italian restaurant in the north of town.']" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model(['hi, i want some cheap italian food in the north of town'])" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-29 14:42:17.110 WARNING in 'deeppavlov.models.go_bot.bot'['bot'] at line 343: No database specified.\n", + "2018-06-29 14:42:17.110 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 344: Made api_call with {'pricerange': 'cheap', 'food': 'italian', 'area': 'north'}, got 0 results.\n" + ] + }, + { + "data": { + "text/plain": [ + "['Sorry there is no italian restaurant in the north of town.']" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model(['thanks, bye'])" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "model.reset() # resetting dialog context to start a new one" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-28 14:46:37.496 WARNING in 'deeppavlov.models.go_bot.bot'['bot'] at line 343: No database specified.\n", + "2018-06-28 14:46:37.497 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 344: Made api_call with {'pricerange': 'cheap', 'food': 'italian', 'area': 'north'}, got 0 results.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">> Sorry there is no italian restaurant in the north of town.\n", + ":: i want french food\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-28 14:46:51.212 WARNING in 'deeppavlov.models.go_bot.bot'['bot'] at line 343: No database specified.\n", + "2018-06-28 14:46:51.212 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 344: Made api_call with {'pricerange': 'cheap', 'food': 'french', 'area': 'north'}, got 0 results.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">> Sorry there is no french restaurant in the north of town.\n", + ":: ok, bye\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-28 14:46:54.950 WARNING in 'deeppavlov.models.go_bot.bot'['bot'] at line 343: No database specified.\n", + "2018-06-28 14:46:54.950 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 344: Made api_call with {'pricerange': 'cheap', 'food': 'french', 'area': 'north'}, got 0 results.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">> Sorry there is no french restaurant in the north of town.\n", + ":: exit\n" + ] + } + ], + "source": [ + "# if the cell is running, please do not run other cells in 
parallel -- there is a possibility of a hangup\n", + "\n", + "utterance = \"\"\n", + "while utterance != 'exit':\n", + " print(\">> \" + model([utterance])[0])\n", + " utterance = input(':: ')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The model couldn't fill some slots. For example, #address, #phone, #postcode of a restaurant couldn't be inferred from user utterance. \n", + "\n", + "A list of available restaurants is required." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model `gobot_dstc2_db`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's now add a database with restaurants and train a new model:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initializing new config:" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [], + "source": [ + "db_config = copy.deepcopy(simple_config)\n", + "\n", + "db_config['chainer']['pipe'] = []" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Creating database component config:" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "db_comp_config = {\n", + " 'name': 'sqlite_database',\n", + " 'id': 'restaurant_database', \n", + " 'save_path': 'dstc2_v2/resto.sqlite',\n", + " 'primary_keys': ['name'],\n", + " 'table_name': 'mytable'\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Adding vocab and database components to pipe:" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [], + "source": [ + "db_config['chainer']['pipe'].append(vocab_comp_config)\n", + "db_config['chainer']['pipe'].append(db_comp_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initializing bot component config:" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [], + "source": [ + "bot_with_db_comp_config = copy.deepcopy(bot_comp_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**WARNING:** Do no forget to change `load_path` and `save_path` in neural network configuration when \n", + " training a new modification. Otherwise previous model's files will be overwritten." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [], + "source": [ + "bot_with_db_comp_config['network_parameters']['load_path'] = 'gobot_dstc2_db/model'\n", + "bot_with_db_comp_config['network_parameters']['save_path'] = 'gobot_dstc2_db/model'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Adding database to bot component config:" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [], + "source": [ + "bot_with_db_comp_config['database'] = '#restaurant_database'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Addind bot component to pipe:" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [], + "source": [ + "db_config['chainer']['pipe'].append(bot_with_db_comp_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [], + "source": [ + "json.dump(db_config, open(\"gobot/db_config.json\", 'wt'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The new model now updates dialog state not only with entity values mentioned by user (\"i want cheap food\" -> {'pricerange': 'cheap'}), but also with restaurant info taken from sql database of restaurants.\n", + "\n", + "Model has a special action `api_call_action`, which initiates a request to sql database with current dialog state and thus receives info of a single matching restaurant.\n", + "\n", + "So now such slots as #address, #phone and #postcode can be filled in bot responses." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_evaluate_model_from_config(\"gobot/db_config.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = build_model_from_config(db_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">> Hello, welcome to the Cambridge restaurant system. You can ask for restaurants by area, price range or food type. 
How may I help you?\n", + ":: i want cheap restaurant in the north of town\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2018-06-28 14:57:43.780 INFO in 'deeppavlov.models.go_bot.bot'['bot'] at line 344: Made api_call with {'pricerange': 'cheap', 'area': 'north'}, got 2 results.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">> Da vinci pizzeria is a nice place in the north of town and the prices are cheap.\n", + ":: ok, give me their phone number\n", + ">> The phone number of da vinci pizzeria is 01223 351707.\n", + ":: post code too\n", + ">> Da vinci pizzeria is a nice place in the north of town and the prices are cheap.\n", + ":: give me their address\n", + ">> Da vinci pizzeria is a nice place in the north of town and the prices are cheap.\n", + ":: bye\n", + ">> Da vinci pizzeria is a nice place in the north of town and the prices are cheap.\n", + ":: exit\n" + ] + } + ], + "source": [ + "# if the cell is running, please do not run other cells in parallel -- there is a possibility of a hangup\n", + "\n", + "model.reset() # starting new dialog\n", + "\n", + "utterance = \"\"\n", + "while utterance != 'exit':\n", + " print(\">> \" + model([utterance])[0])\n", + " utterance = input(':: ')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model `gobot_dstc2_emb`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's train a goal-oriented bot with fasttext embeddings:\n", + "\n", + "**NOTICE:** YOU NEED TO CONSTRUCT A NEW CONFIG YOURSELF" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initalizing new config:" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "emb_config = copy.deepcopy(db_config)\n", + "\n", + "emb_config['chainer']['pipe'] = []" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Adding vocab and database components to chainer pipe:" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "emb_config['chainer']['pipe'].append(vocab_comp_config)\n", + "emb_config['chainer']['pipe'].append(db_comp_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initalizing embedder component:" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "embedder_comp_config = {\n", + " 'id': 'my_embedder',\n", + " 'name': 'fasttext',\n", + " 'load_path': 'embeddings/dstc2_fastText_model.bin',\n", + " 'save_path': 'embeddings/dstc2_fastText_model.bin',\n", + " 'dim': 100\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: add embedder component to chainer pipe\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initializing bot component config:" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "bot_with_embedder_comp_config = copy.deepcopy(bot_with_db_comp_config)\n", + "\n", + "bot_with_embedder_comp_config['network_parameters']['load_path'] = 'gobot_dstc2_emb/model'\n", + "bot_with_embedder_comp_config['network_parameters']['save_path'] = 'gobot_dstc2_emb/model'" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: add #my_embedder to bot_with_embedder_comp_config\n" + ] + }, + { + 
"cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: add bot_with_embedder_comp_config to chainer pipe\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These are download urls for new required data:" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "embedder_required_data = {\n", + " 'url': 'http://lnsigo.mipt.ru/export/deeppavlov_data/embeddings/dstc2_fastText_model.bin',\n", + " 'subdir': 'embeddings'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: add embedder download info to emb_config['metadata']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [], + "source": [ + "json.dump(emb_config, open(\"gobot/emb_config.json\", 'wt'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As far as we are now using embeddings, we added a file named `dstc2_fastText_model.bin` to `metadata.download` section. \n", + "\n", + "Let's run data loading again." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "deep_download(['--config', 'gobot/emb_config.json'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_evaluate_model_from_config(\"gobot/emb_config.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = build_model_from_config(emb_config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# if the cell is running, please do not run other cells in parallel -- there is a possibility of a hangup\n", + "\n", + "model.reset() # starting new dialog\n", + "\n", + "utterance = \"\"\n", + "while utterance != 'exit':\n", + " print(\">> \" + model([utterance])[0])\n", + " utterance = input(':: ')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Appendix _(optional)_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Additional materials" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- [DataFest Presentation (RU)](https://docs.google.com/presentation/d/1PBPQp-wgQ6aRbm3MsuyGYB_TVg2c7Bf89lhdhbMtC2k)\n", + "- [CISS Video Lecture](https://youtu.be/uvH1zB7qahI)\n", + "- [Video Lecture \"Hybrid dialog bot\" (RU)](http://www.youtube.com/watch?v=JJCO7eWCy-M&t=331m19s)\n", + "- [Video Lecture \"What's inside a dialog system?\" (RU)](http://www.youtube.com/watch?v=JJCO7eWCy-M&t=259m55s)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model `gobot_dstc2_full`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's train a very smart goal-oriented bot that uses an attention mechanism over input embeddings \n", + "\n", + "(see https://medium.com/syncedreview/a-brief-overview-of-attention-mechanism-13c578ba9129 for more details):" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initializing new config:" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [], + "source": [ + "emb_config = json.load(open(\"gobot/emb_config.json\", 'rt'))\n", + "\n", + "full_config = copy.deepcopy(emb_config)\n", + "full_config['chainer']['pipe'] = [\n", + " 
vocab_comp_config,\n", + " db_comp_config,\n", + " embedder_comp_config\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initializing bot component config:" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [], + "source": [ + "bot_with_emb_comp_config = emb_config['chainer']['pipe'][-1]\n", + "bot_with_attn_comp_config = copy.deepcopy(bot_with_emb_comp_config)\n", + "\n", + "bot_with_attn_comp_config['network_parameters']['load_path'] = 'gobot_dstc2_full/model'\n", + "bot_with_attn_comp_config['network_parameters']['save_path'] = 'gobot_dstc2_full/model'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Adding attention mechanism to bot:" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [], + "source": [ + "attention_mechanism_config = {\n", + " 'action_as_key': True,\n", + " 'depth': 3,\n", + " 'hidden_size': 32,\n", + " 'max_num_tokens': 100,\n", + " 'projected_align': False,\n", + " 'type': 'cs_bahdanau'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [], + "source": [ + "bot_with_attn_comp_config['network_parameters']['attention_mechanism'] = attention_mechanism_config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Adding bot component to pipe:" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [], + "source": [ + "full_config['chainer']['pipe'].append(bot_with_attn_comp_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [], + "source": [ + "json.dump(full_config, open(\"gobot/full_config.json\", 'wt'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_evaluate_model_from_config(\"gobot/full_config.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = build_model_from_config(full_config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# if the cell is running, please do not run other cells in parallel -- there is a possibility of a hangup\n", + "\n", + "model.reset() # starting new dialog\n", + "\n", + "utterance = \"\"\n", + "while utterance != 'exit':\n", + " print(\">> \" + model([utterance])[0])\n", + " utterance = input(':: ')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Another way of training and infering a component" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's build response token vocabulary, but do it without using deeppavlov scripts (without `train_evaluate_model_from_config` and `build_model_from_config`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.core.data.vocab import DefaultVocabulary\n", + "from deeppavlov.dataset_readers.dstc2_reader import DSTC2Version2DatasetReader\n", + "from deeppavlov.dataset_iterators.dialog_iterator import DialogDatasetIterator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initializing a `DefaultVocabulary` class:" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [], + "source": [ + "y_vocab = DefaultVocabulary(level='token', \n", + " load_path='vocabs/y_token.dict', # path is relative to DEEPPAVLOV_ROOT/../download/ \n", + " save_path='vocabs/y_token.dict',\n", + " tokenizer=lambda s_batch: [s.split() for s in s_batch])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Important methods of any trained component are:\n", + "\n", + " - **\\_\\_init\\_\\_(self, *args, *\\*kwargs)**\n", + " - intializes a class instance\n", + "\n", + " - **fit(self, data, *args)** or **train_on_batch(self, batch, *args)**\n", + " - fits on full data or makes one training step on a batch of data\n", + "\n", + " - **\\_\\_call\\_\\_(self, batch, \\*\\*kwargs)**\n", + " - makes prediction (or infers) for each sample in a batch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Getting batches of data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = DSTC2Version2DatasetReader().read(data_path=\"tmp/my_download_of_dstc2\")\n", + "data_samples = DialogDatasetIterator(data, seed=1443, shuffle=True).get_instances(data_type='all')\n", + "x_list, y_list = data_samples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Building vocabulary using y batches (`y_list` contains bot responses):" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [], + "source": [ + "y_vocab.fit(y_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Infering from (using) built vocabulary:" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[43, 3, 26, 5, 0]" + ] + }, + "execution_count": 130, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_vocab(['is', 'the', 'of', 'restaurant', 'hi'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To call a model `y_vocab(batch)` is the same as to call a \\_\\_call\\_\\_ method `y_vocab.__call__(batch)`!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 198, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 198, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_vocab(['hi']) == y_vocab.__call__(['hi']) == [141]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tensorflow_kernel", + "language": "python", + "name": "tensorflow_kernel" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/tutorials/03_deeppavlov_gobot.pdf b/examples/tutorials/03_deeppavlov_gobot.pdf new file mode 100644 index 0000000000..39457bcd2e Binary files /dev/null and b/examples/tutorials/03_deeppavlov_gobot.pdf differ diff --git a/examples/tutorials/04_deeppavlov_chitchat.ipynb b/examples/tutorials/04_deeppavlov_chitchat.ipynb new file mode 100644 index 0000000000..a28f3b13cb --- /dev/null +++ b/examples/tutorials/04_deeppavlov_chitchat.ipynb @@ -0,0 +1,1239 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DeepPavlov sequence-to-sequence tutorial" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial we are going to implement sequence-to-sequence [[original paper]](https://arxiv.org/abs/1409.3215) model in DeepPavlov.\n", + "\n", + "Sequence-to-sequence is the concept of mapping input sequence to target sequence. Sequence-to-sequence models consist of two main components: encoder and decoder. Encoder is used to encode the input sequence to dense representation and decoder uses this dense representation to generate target sequence.\n", + "\n", + "![sequence-to-sequence](img/seq2seq.png)\n", + "\n", + "Here, input sequence is ABC, special token (end of sequence) is used as indicator to start decoding target sequence WXYZ.\n", + "\n", + "To implement this model in DeepPavlov we have to code some DeepPavlov abstractions:\n", + "* **DatasetReader** to read the data\n", + "* **DatasetIterator** to generate batches\n", + "* **Vocabulary** to convert words to indexes\n", + "* **Model** to train it and then use it\n", + "* and some other components for pre- and postprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import deeppavlov\n", + "import json\n", + "import numpy as np\n", + "import tensorflow as tf\n", + "\n", + "from itertools import chain\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download & extract dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.core.data.utils import download_decompress\n", + "download_decompress('http://lnsigo.mipt.ru/export/datasets/personachat_v2.tar.gz', './personachat')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DatasetReader" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DatasetReader is used to read and parse data from files. Here, we define new PersonaChatDatasetReader which reads [PersonaChat dataset](https://arxiv.org/abs/1801.07243). 
PersonaChat dataset consists of dialogs and user personalities.\n", + "\n", + "User personality is described by four sentences, e.g.:\n", + "\n", + " i like to remodel homes.\n", + " i like to go hunting.\n", + " i like to shoot a bow.\n", + " my favorite holiday is halloween." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.core.commands.train import build_model_from_config\n", + "from deeppavlov.core.data.dataset_reader import DatasetReader\n", + "from deeppavlov.core.data.utils import download_decompress\n", + "from deeppavlov.core.common.registry import register\n", + "\n", + "@register('personachat_dataset_reader')\n", + "class PersonaChatDatasetReader(DatasetReader):\n", + " \"\"\"\n", + " PersonaChat dataset from\n", + " Zhang S. et al. Personalizing Dialogue Agents: I have a dog, do you have pets too?\n", + " https://arxiv.org/abs/1801.07243\n", + " Also, this dataset is used in ConvAI2 http://convai.io/\n", + " This class reads dataset to the following format:\n", + " [{\n", + " 'persona': [list of persona sentences],\n", + " 'x': input utterance,\n", + " 'y': output utterance,\n", + " 'dialog_history': list of previous utterances\n", + " 'candidates': [list of candidate utterances]\n", + " 'y_idx': index of y utt in candidates list\n", + " },\n", + " ...\n", + " ]\n", + " \"\"\"\n", + " def read(self, dir_path: str, mode='self_original'):\n", + " dir_path = Path(dir_path)\n", + " dataset = {}\n", + " for dt in ['train', 'valid', 'test']:\n", + " dataset[dt] = self._parse_data(dir_path / '{}_{}.txt'.format(dt, mode))\n", + "\n", + " return dataset\n", + "\n", + " @staticmethod\n", + " def _parse_data(filename):\n", + " examples = []\n", + " print(filename)\n", + " curr_persona = []\n", + " curr_dialog_history = []\n", + " persona_done = False\n", + " with filename.open('r') as fin:\n", + " for line in fin:\n", + " line = ' '.join(line.strip().split(' ')[1:])\n", + " your_persona_pref = 'your persona: '\n", + " if line[:len(your_persona_pref)] == your_persona_pref and persona_done:\n", + " curr_persona = [line[len(your_persona_pref):]]\n", + " curr_dialog_history = []\n", + " persona_done = False\n", + " elif line[:len(your_persona_pref)] == your_persona_pref:\n", + " curr_persona.append(line[len(your_persona_pref):])\n", + " else:\n", + " persona_done = True\n", + " x, y, _, candidates = line.split('\\t')\n", + " candidates = candidates.split('|')\n", + " example = {\n", + " 'persona': curr_persona,\n", + " 'x': x,\n", + " 'y': y,\n", + " 'dialog_history': curr_dialog_history[:],\n", + " 'candidates': candidates,\n", + " 'y_idx': candidates.index(y)\n", + " }\n", + " curr_dialog_history.extend([x, y])\n", + " examples.append(example)\n", + "\n", + " return examples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = PersonaChatDatasetReader().read('./personachat')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Let's check dataset size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for k in data:\n", + " print(k, len(data[k]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data['train'][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dataset iterator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"Dataset iterator is used to generate batches from parsed dataset (DatasetReader). Let's extract only *x* and *y* from parsed dataset and use them to predict sentence *y* by sentence *x*." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.core.data.data_learning_iterator import DataLearningIterator\n", + "\n", + "@register('personachat_iterator')\n", + "class PersonaChatIterator(DataLearningIterator):\n", + " def split(self, *args, **kwargs):\n", + " for dt in ['train', 'valid', 'test']:\n", + " setattr(self, dt, self._to_tuple(getattr(self, dt)))\n", + "\n", + " @staticmethod\n", + " def _to_tuple(data):\n", + " \"\"\"\n", + " Returns:\n", + " list of (x, y)\n", + " \"\"\"\n", + " return list(map(lambda x: (x['x'], x['y']), data))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's look on data in batches:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "iterator = PersonaChatIterator(data)\n", + "batch = [el for el in iterator.gen_batches(5, 'train')][0]\n", + "for x, y in zip(*batch):\n", + " print('x:', x)\n", + " print('y:', y)\n", + " print('----------')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tokenizer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Tokenizer is used to extract tokens from utterance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.models.preprocessors.lazy_tokenizer import LazyTokenizer\n", + "tokenizer = LazyTokenizer()\n", + "tokenizer(['Hello my friend'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Vocabulary" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Vocabulary prepares mapping from tokens to token indexes. It uses train data to build this mapping.\n", + "\n", + "We will implement DialogVocab (inherited from SimpleVocabulary) wich adds all tokens from *x* and *y* utterances to vocabulary." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.core.data.simple_vocab import SimpleVocabulary\n", + "\n", + "@register('dialog_vocab')\n", + "class DialogVocab(SimpleVocabulary):\n", + " def fit(self, *args):\n", + " tokens = chain(*args)\n", + " super().fit(tokens)\n", + "\n", + " def __call__(self, batch, **kwargs):\n", + " indices_batch = []\n", + " for utt in batch:\n", + " tokens = [self[token] for token in utt]\n", + " indices_batch.append(tokens)\n", + " return indices_batch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create instance of DialogVocab. We define save and load paths, minimal frequence of tokens which are added to vocabulary and set of special tokens.\n", + "\n", + "Special tokens are:\n", + "* - padding\n", + "* - begin of sequence\n", + "* - end of sequence\n", + "* - unknown token - token which is not presented in vocabulary\n", + "\n", + "And fit it on tokens from *x* and *y*." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vocab = DialogVocab(\n", + " save_path='./vocab.dict',\n", + " load_path='./vocab.dict',\n", + " min_freq=2,\n", + " special_tokens=('','', '', '',),\n", + " unk_token=''\n", + ")\n", + "\n", + "vocab.fit(tokenizer(iterator.get_instances(data_type='train')[0]), tokenizer(iterator.get_instances(data_type='train')[1]))\n", + "vocab.save()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Top 10 most frequent tokens in train dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vocab.freqs.most_common(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Number of tokens in vocabulary:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(vocab)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use built vocabulary to encode some tokenized sentence." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vocab([['', 'hello', 'my', 'friend', 'there_is_no_such_word_in_dataset', 'and_this', '', '']])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Padding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To feed sequences of token indexes to neural model we should make their lengths equal. If sequence is too short we add symbols to the end of sequence. If sequence is too long we just cut it.\n", + "\n", + "SentencePadder implements such behavior, it also adds and tokens." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.core.models.component import Component\n", + "\n", + "@register('sentence_padder')\n", + "class SentencePadder(Component):\n", + " def __init__(self, length_limit, pad_token_id=0, start_token_id=1, end_token_id=2, *args, **kwargs):\n", + " self.length_limit = length_limit\n", + " self.pad_token_id = pad_token_id\n", + " self.start_token_id = start_token_id\n", + " self.end_token_id = end_token_id\n", + "\n", + " def __call__(self, batch):\n", + " for i in range(len(batch)):\n", + " batch[i] = batch[i][:self.length_limit]\n", + " batch[i] = [self.start_token_id] + batch[i] + [self.end_token_id]\n", + " batch[i] += [self.pad_token_id] * (self.length_limit + 2 - len(batch[i]))\n", + " return batch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "padder = SentencePadder(length_limit=6)\n", + "vocab(padder(vocab([['hello', 'my', 'friend', 'there_is_no_such_word_in_dataset', 'and_this']])))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Seq2Seq Model\n", + "Model consists of two main components: encoder and decoder. We can implement them independently and then put them together in one Seq2Seq model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Encoder\n", + "Encoder builds hidden representation of input sequence." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def encoder(inputs, inputs_len, embedding_matrix, cell_size, keep_prob=1.0):\n", + " # inputs: tf.int32 tensor with shape bs x seq_len with token ids\n", + " # inputs_len: tf.int32 tensor with shape bs\n", + " # embedding_matrix: tf.float32 tensor with shape vocab_size x vocab_dim\n", + " # cell_size: hidden size of recurrent cell\n", + " # keep_prob: dropout keep probability\n", + " with tf.variable_scope('encoder'):\n", + " # first of all we should embed every token in input sequence (use tf.nn.embedding_lookup, don't forget about dropout)\n", + " x_emb = tf.nn.dropout(tf.nn.embedding_lookup(embedding_matrix, inputs), keep_prob=keep_prob)\n", + " \n", + " # define recurrent cell (LSTM or GRU)\n", + " encoder_cell = tf.nn.rnn_cell.GRUCell(\n", + " num_units=cell_size,\n", + " kernel_initializer=tf.contrib.layers.xavier_initializer(),\n", + " name='encoder_cell')\n", + " \n", + " # use tf.nn.dynamic_rnn to encode input sequence, use actual length of input sequence\n", + " encoder_outputs, encoder_state = tf.nn.dynamic_rnn(cell=encoder_cell, inputs=x_emb, sequence_length=inputs_len, dtype=tf.float32)\n", + " return encoder_outputs, encoder_state" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check your encoder implementation:\n", + "\n", + "next cell output shapes are\n", + "\n", + "32 x 10 x 100 and 32 x 100 " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "vocab_size = 100\n", + "hidden_dim = 100\n", + "inputs = tf.cast(tf.random_uniform(shape=[32, 10]) * vocab_size, tf.int32) # bs x seq_len\n", + "mask = tf.cast(tf.random_uniform(shape=[32, 10]) * 2, tf.int32) # bs x seq_len\n", + "inputs_len = tf.reduce_sum(mask, axis=1)\n", + "embedding_matrix = tf.random_uniform(shape=[vocab_size, hidden_dim])\n", + "\n", + "encoder(inputs, inputs_len, embedding_matrix, hidden_dim)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Decoder\n", + "Decoder uses encoder outputs and encoder state to produce output sequence.\n", + "\n", + "Here, you should:\n", + "* define your decoder_cell (GRU or LSTM)\n", + "\n", + "it will be your baseline seq2seq model.\n", + "\n", + "\n", + "And, to improve the model:\n", + "* add Teacher Forcing\n", + "* add Attention Mechanism" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def decoder(encoder_outputs, encoder_state, embedding_matrix, mask,\n", + " cell_size, max_length, y_ph,\n", + " start_token_id=1, keep_prob=1.0,\n", + " teacher_forcing_rate_ph=None,\n", + " use_attention=False, is_train=True):\n", + " # decoder\n", + " # encoder_outputs: tf.float32 tensor with shape bs x seq_len x encoder_cell_size\n", + " # encoder_state: tf.float32 tensor with shape bs x encoder_cell_size\n", + " # embedding_matrix: tf.float32 tensor with shape vocab_size x vocab_dim\n", + " # mask: tf.int32 tensor with shape bs x seq_len with zeros for masked sequence elements\n", + " # cell_size: hidden size of recurrent cell\n", + " # max_length: max length of output sequence\n", + " # start_token_id: id of token in vocabulary\n", + " # keep_prob: dropout keep probability\n", + " # teacher_forcing_rate_ph: rate of using teacher forcing on each decoding step\n", + " # use_attention: use attention on encoder outputs or use only encoder_state\n", + " # 
is_train: is it training or inference? at inference time we can't use teacher forcing\n", + " with tf.variable_scope('decoder'):\n", + " # define decoder recurrent cell\n", + " decoder_cell = tf.nn.rnn_cell.GRUCell(\n", + " num_units=cell_size,\n", + " kernel_initializer=tf.contrib.layers.xavier_initializer(),\n", + " name='decoder_cell')\n", + " \n", + " # initial value of output_token on previsous step is start_token\n", + " output_token = tf.ones(shape=(tf.shape(encoder_outputs)[0],), dtype=tf.int32) * start_token_id\n", + " # let's define initial value of decoder state with encoder_state\n", + " decoder_state = encoder_state\n", + "\n", + " pred_tokens = []\n", + " logits = []\n", + "\n", + " # use for loop to sequentially call recurrent cell\n", + " for i in range(max_length):\n", + " \"\"\"\n", + " TEACHER FORCING\n", + " # here you can try to implement teacher forcing for your model\n", + " # details about teacher forcing are explained further in tutorial\n", + " \n", + " # pseudo code:\n", + " NOTE THAT FOLLOWING CONDITIONS SHOULD BE EVALUATED AT GRAPH RUNTIME\n", + " use tf.cond and tf.logical operations instead of python if\n", + " \n", + " if i > 0 and is_train and random_value < teacher_forcing_rate_ph:\n", + " input_token = y_ph[:, i-1]\n", + " else:\n", + " input_token = output_token\n", + "\n", + " input_token_emb = tf.nn.embedding_lookup(embedding_matrix, input_token)\n", + " \n", + " \"\"\"\n", + " if i > 0:\n", + " input_token_emb = tf.cond(\n", + " tf.logical_and(\n", + " is_train,\n", + " tf.random_uniform(shape=(), maxval=1) <= teacher_forcing_rate_ph\n", + " ),\n", + " lambda: tf.nn.embedding_lookup(embedding_matrix, y_ph[:, i-1]), # teacher forcing\n", + " lambda: tf.nn.embedding_lookup(embedding_matrix, output_token)\n", + " )\n", + " else:\n", + " input_token_emb = tf.nn.embedding_lookup(embedding_matrix, output_token)\n", + "\n", + " \"\"\"\n", + " ATTENTION MECHANISM\n", + " # here you can add attention to your model\n", + " # you can find details about attention further in tutorial\n", + " \"\"\" \n", + " if use_attention:\n", + " # compute attention and concat attention vector to input_token_emb\n", + " att = dot_attention(encoder_outputs, decoder_state, mask, scope='att')\n", + " input_token_emb = tf.concat([input_token_emb, att], axis=-1)\n", + "\n", + "\n", + " input_token_emb = tf.nn.dropout(input_token_emb, keep_prob=keep_prob)\n", + " # call recurrent cell\n", + " decoder_outputs, decoder_state = decoder_cell(input_token_emb, decoder_state)\n", + " decoder_outputs = tf.nn.dropout(decoder_outputs, keep_prob=keep_prob)\n", + " # project decoder output to embeddings dimension\n", + " embeddings_dim = embedding_matrix.get_shape()[1]\n", + " output_proj = tf.layers.dense(decoder_outputs, embeddings_dim, activation=tf.nn.tanh,\n", + " kernel_initializer=tf.contrib.layers.xavier_initializer(),\n", + " name='proj', reuse=tf.AUTO_REUSE)\n", + " # compute logits\n", + " output_logits = tf.matmul(output_proj, embedding_matrix, transpose_b=True)\n", + "\n", + " logits.append(output_logits)\n", + " output_probs = tf.nn.softmax(output_logits)\n", + " output_token = tf.argmax(output_probs, axis=-1)\n", + " pred_tokens.append(output_token)\n", + "\n", + " y_pred_tokens = tf.transpose(tf.stack(pred_tokens, axis=0), [1, 0])\n", + " y_logits = tf.transpose(tf.stack(logits, axis=0), [1, 0, 2])\n", + " return y_pred_tokens, y_logits" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Output of next cell should be with shapes:\n", + "\n", + " 32 x 
10\n", + " 32 x 10 x 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "vocab_size = 100\n", + "hidden_dim = 100\n", + "inputs = tf.cast(tf.random_uniform(shape=[32, 10]) * vocab_size, tf.int32) # bs x seq_len\n", + "mask = tf.cast(tf.random_uniform(shape=[32, 10]) * 2, tf.int32) # bs x seq_len\n", + "inputs_len = tf.reduce_sum(mask, axis=1)\n", + "embedding_matrix = tf.random_uniform(shape=[vocab_size, hidden_dim])\n", + "\n", + "teacher_forcing_rate = tf.random_uniform(shape=())\n", + "y = tf.cast(tf.random_uniform(shape=[32, 10]) * vocab_size, tf.int32)\n", + "\n", + "encoder_outputs, encoder_state = encoder(inputs, inputs_len, embedding_matrix, hidden_dim)\n", + "decoder(encoder_outputs, encoder_state, embedding_matrix, mask, hidden_dim, max_length=10,\n", + " y_ph=y, teacher_forcing_rate_ph=teacher_forcing_rate)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Seq2Seq model should be inherited from TFModel class and implement following methods:\n", + "* train_on_batch - this method is called in training phase\n", + "* \\_\\_call\\_\\_ - this method is called to make predictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.core.models.tf_model import TFModel\n", + "\n", + "@register('seq2seq')\n", + "class Seq2Seq(TFModel):\n", + " def __init__(self, **kwargs):\n", + " # hyperparameters\n", + " \n", + " # dimension of word embeddings\n", + " self.embeddings_dim = kwargs.get('embeddings_dim', 100)\n", + " # size of recurrent cell in encoder and decoder\n", + " self.cell_size = kwargs.get('cell_size', 200)\n", + " # dropout keep_probability\n", + " self.keep_prob = kwargs.get('keep_prob', 0.8)\n", + " # learning rate\n", + " self.learning_rate = kwargs.get('learning_rate', 3e-04)\n", + " # max length of output sequence\n", + " self.max_length = kwargs.get('max_length', 20)\n", + " self.grad_clip = kwargs.get('grad_clip', 5.0)\n", + " self.start_token_id = kwargs.get('start_token_id', 1)\n", + " self.vocab_size = kwargs.get('vocab_size', 11595)\n", + " self.teacher_forcing_rate = kwargs.get('teacher_forcing_rate', 0.0)\n", + " self.use_attention = kwargs.get('use_attention', False)\n", + " \n", + " # create tensorflow session to run computational graph in it\n", + " self.sess_config = tf.ConfigProto(allow_soft_placement=True)\n", + " self.sess_config.gpu_options.allow_growth = True\n", + " self.sess = tf.Session(config=self.sess_config)\n", + " \n", + " self.init_graph()\n", + " \n", + " # define train op\n", + " self.train_op = self.get_train_op(self.loss, self.lr_ph,\n", + " optimizer=tf.train.AdamOptimizer,\n", + " clip_norm=self.grad_clip)\n", + " # initialize graph variables\n", + " self.sess.run(tf.global_variables_initializer())\n", + " \n", + " super().__init__(**kwargs)\n", + " # load saved model if there is one\n", + " if self.load_path is not None:\n", + " self.load()\n", + " \n", + " def init_graph(self):\n", + " # create placeholders\n", + " self.init_placeholders()\n", + "\n", + " self.x_mask = tf.cast(self.x_ph, tf.int32) \n", + " self.y_mask = tf.cast(self.y_ph, tf.int32) \n", + " \n", + " self.x_len = tf.reduce_sum(self.x_mask, axis=1)\n", + " \n", + " # create embeddings matrix for tokens\n", + " self.embeddings = tf.Variable(tf.random_uniform((self.vocab_size, 
self.embeddings_dim), -0.1, 0.1, name='embeddings'), dtype=tf.float32)\n", + "\n", + " # encoder\n", + " encoder_outputs, encoder_state = encoder(self.x_ph, self.x_len, self.embeddings, self.cell_size, self.keep_prob_ph)\n", + "\n", + " # decoder\n", + " self.y_pred_tokens, y_logits = decoder(encoder_outputs, encoder_state, self.embeddings, self.x_mask,\n", + " self.cell_size, self.max_length,\n", + " self.y_ph, self.start_token_id, self.keep_prob_ph,\n", + " self.teacher_forcing_rate_ph, self.use_attention, self.is_train_ph)\n", + " \n", + " # loss\n", + " self.y_ohe = tf.one_hot(self.y_ph, depth=self.vocab_size)\n", + " self.y_mask = tf.cast(self.y_mask, tf.float32)\n", + " self.loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.y_ohe, logits=y_logits) * self.y_mask\n", + " self.loss = tf.reduce_sum(self.loss) / tf.reduce_sum(self.y_mask)\n", + " \n", + " def init_placeholders(self):\n", + " # placeholders for inputs\n", + " self.x_ph = tf.placeholder(shape=(None, None), dtype=tf.int32, name='x_ph')\n", + " # at inference time y_ph is used (y_ph exists in computational graph) when teacher forcing is activated, so we add dummy default value\n", + " # this dummy value is not actually used at inference\n", + " self.y_ph = tf.placeholder_with_default(tf.zeros_like(self.x_ph), shape=(None,None), name='y_ph')\n", + "\n", + " # placeholders for model parameters\n", + " self.lr_ph = tf.placeholder(dtype=tf.float32, shape=[], name='lr_ph')\n", + " self.keep_prob_ph = tf.placeholder_with_default(1.0, shape=[], name='keep_prob_ph')\n", + " self.is_train_ph = tf.placeholder_with_default(False, shape=[], name='is_train_ph')\n", + " self.teacher_forcing_rate_ph = tf.placeholder_with_default(0.0, shape=[], name='teacher_forcing_rate_ph')\n", + " \n", + " def _build_feed_dict(self, x, y=None):\n", + " feed_dict = {\n", + " self.x_ph: x,\n", + " }\n", + " if y is not None:\n", + " feed_dict.update({\n", + " self.y_ph: y,\n", + " self.lr_ph: self.learning_rate,\n", + " self.keep_prob_ph: self.keep_prob,\n", + " self.is_train_ph: True,\n", + " self.teacher_forcing_rate_ph: self.teacher_forcing_rate,\n", + " })\n", + " return feed_dict\n", + " \n", + " def train_on_batch(self, x, y):\n", + " feed_dict = self._build_feed_dict(x, y)\n", + " loss, _ = self.sess.run([self.loss, self.train_op], feed_dict=feed_dict)\n", + " return loss\n", + " \n", + " def __call__(self, x):\n", + " feed_dict = self._build_feed_dict(x)\n", + " y_pred = self.sess.run(self.y_pred_tokens, feed_dict=feed_dict)\n", + " return y_pred" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create model with random weights and default parameters, change path to model, otherwise it will be stored in deeppavlov/download folder:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s2s = Seq2Seq(\n", + " save_path='PATH_TO_YOUR_WORKING_DIR/model',\n", + " load_path='PATH_TO_YOUR_WORKING_DIR/model'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we firstly run all preprocessing steps and call seq2seq model, and then convert token indexes to tokens. As result we should get some random sequence of words." 
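As a reading aid, the chained call in the next cell can be unrolled step by step as follows (a sketch reusing the `vocab`, `padder` and `s2s` objects created above; the untrained model will of course produce noise):

```python
# Step-by-step version of vocab(s2s(padder(vocab([tokens])))):
tokens = ['hello', 'my', 'friend']
token_ids = vocab([tokens])       # tokens -> indices (batch of one utterance)
padded_ids = padder(token_ids)    # cut/pad to length_limit, add start/end/pad ids
pred_ids = s2s(padded_ids)        # seq2seq predicts a batch of output index sequences
pred_tokens = vocab(pred_ids)     # indices -> tokens (the vocab maps both ways)
print(pred_tokens)
```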
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vocab(s2s(padder(vocab([['hello', 'my', 'friend', 'there_is_no_such_word_in_dataset', 'and_this']]))))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Attention mechanism\n", + "Attention mechanism [[paper](https://arxiv.org/abs/1409.0473)] allows to aggregate information from \"memory\" according to current state. By aggregating we suppose weighted sum of \"memory\" items. Weight of each memory item depends on current state.\n", + "\n", + "Without attention decoder could use only last hidden state of encoder. Attention mechanism gives access to all encoder states during decoding.\n", + "\n", + "![attention](img/attention.png)\n", + "\n", + "One of the simpliest ways to compute attention weights (*a_ij*) is to compute them by dot product between memory items and state and then apply softmax function. Other ways of computing *multiplicative* attention could be found in this [paper](https://arxiv.org/abs/1508.04025).\n", + "\n", + "We also need a mask to skip some sequence elements like . To make weight of undesired memory items close to zero we can add big negative value to logits (result of dot product) before applying softmax." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def softmax_mask(values, mask):\n", + " # adds big negative to masked values\n", + " INF = 1e30\n", + " return -INF * (1 - tf.cast(mask, tf.float32)) + values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def dot_attention(memory, state, mask, scope=\"dot_attention\"):\n", + " # inputs: bs x seq_len x hidden_dim\n", + " # state: bs x hidden_dim\n", + " # mask: bs x seq_len\n", + " with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):\n", + " # dot product between each item in memory and state\n", + " logits = tf.matmul(memory, tf.expand_dims(state, axis=1), transpose_b=True)\n", + " logits = tf.squeeze(logits, [2])\n", + " \n", + " # apply mask to logits\n", + " logits = softmax_mask(logits, mask)\n", + " \n", + " # apply softmax to logits\n", + " att_weights = tf.expand_dims(tf.nn.softmax(logits), axis=2)\n", + " \n", + " # compute weighted sum of items in memory\n", + " att = tf.reduce_sum(att_weights * memory, axis=1)\n", + " return att" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check your implementation:\n", + "\n", + "outputs should be with shapes 32 x 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "memory = tf.random_normal(shape=[32, 10, 100]) # bs x seq_len x hidden_dim\n", + "state = tf.random_normal(shape=[32, 100]) # bs x hidden_dim\n", + "mask = tf.cast(tf.random_normal(shape=[32, 10]), tf.int32) # bs x seq_len\n", + "dot_attention(memory, state, mask)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Teacher forcing\n", + "\n", + "We have implemented decoder, which takes as input it's own output during training and inference time. But, at early stages of training it could be hard for model to produce long sequences depending on it's own close to random output. Teacher forcing can help with this: instead of feeding model's output we can feed ground truth tokens. 
It helps model on training time, but on inference we still can rely only on it's own output.\n", + "\n", + "\n", + "Using model's output:\n", + "\n", + "\"sampling\"\n", + "\n", + "Teacher forcing:\n", + "\n", + "\"teacher_forcing\"\n", + "\n", + "It is not necessary to feed ground truth tokens on each time step - we can randomly choose with some rate if we want ground truth input or predicted by model.\n", + "*teacher_forcing_rate* parameter of seq2seq model can control such behavior.\n", + "\n", + "More details about teacher forcing could be found in DeepLearningBook [Chapter 10.2.1](http://www.deeplearningbook.org/contents/rnn.html)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create model with random weights and default parameters:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we firstly run all preprocessing steps and call seq2seq model, and then convert token indexes to tokens. As result we should get some random sequence of words." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Postprocessing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In postprocessing step we are going to remove all , , tokens." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@register('postprocessing')\n", + "class SentencePostprocessor(Component):\n", + " def __init__(self, pad_token='', start_token='', end_token='', *args, **kwargs):\n", + " self.pad_token = pad_token\n", + " self.start_token = start_token\n", + " self.end_token = end_token\n", + "\n", + " def __call__(self, batch):\n", + " for i in range(len(batch)):\n", + " batch[i] = ' '.join(self._postproc(batch[i]))\n", + " return batch\n", + " \n", + " def _postproc(self, utt):\n", + " if self.end_token in utt:\n", + " utt = utt[:utt.index(self.end_token)]\n", + " return utt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "postprocess = SentencePostprocessor()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "postprocess(vocab(s2s(padder(vocab([['hello', 'my', 'friend', 'there_is_no_such_word_in_dataset', 'and_this']])))))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create config file\n", + "Let's put is all together in one config file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config = {\n", + " \"dataset_reader\": {\n", + " \"name\": \"personachat_dataset_reader\",\n", + " \"data_path\": \"YOUR_PATH_TO_FOLDER_WITH_PERSONACHAT_DATASET\"\n", + " },\n", + " \"dataset_iterator\": {\n", + " \"name\": \"personachat_iterator\",\n", + " \"seed\": 1337,\n", + " \"shuffle\": True\n", + " },\n", + " \"chainer\": {\n", + " \"in\": [\"x\"],\n", + " \"in_y\": [\"y\"],\n", + " \"pipe\": [\n", + " {\n", + " \"name\": \"lazy_tokenizer\",\n", + " \"id\": \"tokenizer\",\n", + " \"in\": [\"x\"],\n", + " \"out\": [\"x_tokens\"]\n", + " },\n", + " {\n", + " \"name\": \"lazy_tokenizer\",\n", + " \"id\": \"tokenizer\",\n", + " \"in\": [\"y\"],\n", + " \"out\": [\"y_tokens\"]\n", + " },\n", + " {\n", + " \"name\": \"dialog_vocab\",\n", + " \"id\": \"vocab\",\n", + " \"save_path\": \"YOUR_PATH_TO_WORKING_DIR/vocab.dict\",\n", + " \"load_path\": \"YOUR_PATH_TO_WORKING_DIR/vocab.dict\",\n", + " \"min_freq\": 2,\n", + " \"special_tokens\": [\"\",\"\", \"\", \"\"],\n", + " \"unk_token\": \"\",\n", + " \"fit_on\": [\"x_tokens\", \"y_tokens\"],\n", + " \"in\": [\"x_tokens\"],\n", + " \"out\": [\"x_tokens_ids\"]\n", + " },\n", + " {\n", + " \"ref\": \"vocab\",\n", + " \"in\": [\"y_tokens\"],\n", + " \"out\": [\"y_tokens_ids\"]\n", + " },\n", + " {\n", + " \"name\": \"sentence_padder\",\n", + " \"id\": \"padder\",\n", + " \"length_limit\": 20,\n", + " \"in\": [\"x_tokens_ids\"],\n", + " \"out\": [\"x_tokens_ids\"]\n", + " },\n", + " {\n", + " \"ref\": \"padder\",\n", + " \"in\": [\"y_tokens_ids\"],\n", + " \"out\": [\"y_tokens_ids\"]\n", + " },\n", + " {\n", + " \"name\": \"seq2seq\",\n", + " \"id\": \"s2s\",\n", + " \"max_length\": \"#padder.length_limit+2\",\n", + " \"cell_size\": 250,\n", + " \"embeddings_dim\": 50,\n", + " \"vocab_size\": 11595,\n", + " \"keep_prob\": 0.8,\n", + " \"learning_rate\": 3e-04,\n", + " \"teacher_forcing_rate\": 0.0,\n", + " \"use_attention\": False,\n", + " \"save_path\": \"YOUR_PATH_TO_WORKING_DIR/model\",\n", + " \"load_path\": \"YOUR_PATH_TO_WORKING_DIR/model\",\n", + " \"in\": [\"x_tokens_ids\"],\n", + " \"in_y\": [\"y_tokens_ids\"],\n", + " \"out\": [\"y_predicted_tokens_ids\"],\n", + " },\n", + " {\n", + " \"ref\": \"vocab\",\n", + " \"in\": [\"y_predicted_tokens_ids\"],\n", + " \"out\": [\"y_predicted_tokens\"]\n", + " },\n", + " {\n", + " \"name\": \"postprocessing\",\n", + " \"in\": [\"y_predicted_tokens\"],\n", + " \"out\": [\"y_predicted_tokens\"]\n", + " }\n", + " ],\n", + " \"out\": [\"y_predicted_tokens\"]\n", + " },\n", + " \"train\": {\n", + " \"log_every_n_batches\": 100,\n", + " \"val_every_n_epochs\":0,\n", + " \"batch_size\": 64,\n", + " \"validation_patience\": 0,\n", + " \"epochs\": 20,\n", + " \"metrics\": [\"bleu\"],\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Interact with model using config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.core.commands.infer import build_model_from_config\n", + "model = build_model_from_config(config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model(['Hi, how are you?', 'Any ideas my dear friend?'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train model\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run experiments 
with and without attention, with teacher forcing and without." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.core.commands.train import train_evaluate_model_from_config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "json.dump(config, open('seq2seq.json', 'w'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_evaluate_model_from_config('seq2seq.json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = build_model_from_config(config)\n", + "model(['hi, how are you?', 'any ideas my dear friend?', 'okay, i agree with you', 'good bye!'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To improve the model you can try to use multilayer (use MultiRNNCell) encoder and decoder, try to use attention with trainable parameters (not dot product scoring function)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dp_tf1.8", + "language": "python", + "name": "dp_tf1.8" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/tutorials/04_deeppavlov_chitchat.pdf b/examples/tutorials/04_deeppavlov_chitchat.pdf new file mode 100644 index 0000000000..083a36e140 Binary files /dev/null and b/examples/tutorials/04_deeppavlov_chitchat.pdf differ diff --git a/examples/tutorials/README.md b/examples/tutorials/README.md new file mode 100644 index 0000000000..c89087ff2a --- /dev/null +++ b/examples/tutorials/README.md @@ -0,0 +1,31 @@ +# DeepPavlov tutorials + +## Introduction to DeepPavlov + +[Jupyter notebook](00_deeppavlov_intro.ipynb) | [slides](00_deeppavlov_intro.pdf) + +Install the library and understand a simple "Hello World!" Bot written in 7 lines of code. Experiment with basic pattern matching rule-based bot. + +## Data preparation in DeepPavlov + +[Jupyter notebook](01_deeppavlov_data.ipynb) + +Learn how to read and prepare data for trainable components. + +## Named Entity Recognition with DeepPavlov + +[Jupyter notebook](02_deeppavlov_ner.ipynb) | [slides](02_deeppavlov_ner.pdf) | [video](https://youtu.be/6HlL87PWxXU) + +Build a simple convolutional neural network to solve the named entity recognition task. Master data downloading, preprocessing and batching then train and score the model. + +## Task-oriented bot with DeepPavlov + +[Jupyter notebook](03_deeppavlov_gobot.ipynb) | [slides](03_deeppavlov_gobot.pdf) | [video](https://youtu.be/uvH1zB7qahI) + +Intro to DeepPavlov configs - a powerfull method to stack models. Study how to train 4 different task-oriented bots on DSTC2 dataset. These include (1) a basic bot, (2) a bot with a database of restaurants, (3) a bot with fasttext embeddings, (4) a bot with attention mechanism over input words. 
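Following up on the chit-chat tutorial's closing suggestion to try an attention scoring function with trainable parameters instead of the plain dot product, here is one possible sketch of a Luong-style "general" scoring (state · W · memory) with the same interface as the tutorial's `dot_attention`; this is an illustrative assumption about how it could look, not code from the tutorial.

```python
def general_attention(memory, state, mask, scope="general_attention"):
    # memory: bs x seq_len x hidden_dim, state: bs x hidden_dim, mask: bs x seq_len
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # project the decoder state with a trainable matrix W
        # (the only change compared to dot_attention)
        projected_state = tf.layers.dense(state, memory.get_shape()[-1], use_bias=False,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          name='W')
        # score each memory item against the projected state
        logits = tf.squeeze(tf.matmul(memory, tf.expand_dims(projected_state, axis=1),
                                      transpose_b=True), [2])
        # mask out padded positions and normalize
        att_weights = tf.expand_dims(tf.nn.softmax(softmax_mask(logits, mask)), axis=2)
        # weighted sum of memory items
        return tf.reduce_sum(att_weights * memory, axis=1)
```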
+ +## Chit-chat bot with DeepPavlov + +[Jupyter notebook](04_deeppavlov_chitchat.ipynb) | [slides](04_deeppavlov_chitchat.pdf) | [video](https://youtu.be/G1TkCkoghC8) + +Implement in DeepPavlov sequence-to-sequence encoder-decoder model with attention mechanism and teacher forcing for chit-chat. diff --git a/examples/tutorials/img/attention.png b/examples/tutorials/img/attention.png new file mode 100644 index 0000000000..eb54d4ba47 Binary files /dev/null and b/examples/tutorials/img/attention.png differ diff --git a/examples/tutorials/img/bot_architecture00.png b/examples/tutorials/img/bot_architecture00.png new file mode 100644 index 0000000000..c2bad9fbba Binary files /dev/null and b/examples/tutorials/img/bot_architecture00.png differ diff --git a/examples/tutorials/img/bot_architecture01.png b/examples/tutorials/img/bot_architecture01.png new file mode 100644 index 0000000000..2018575019 Binary files /dev/null and b/examples/tutorials/img/bot_architecture01.png differ diff --git a/examples/tutorials/img/convolution.png b/examples/tutorials/img/convolution.png new file mode 100644 index 0000000000..d48358ae47 Binary files /dev/null and b/examples/tutorials/img/convolution.png differ diff --git a/examples/tutorials/img/sampling.png b/examples/tutorials/img/sampling.png new file mode 100644 index 0000000000..5538c35e81 Binary files /dev/null and b/examples/tutorials/img/sampling.png differ diff --git a/examples/tutorials/img/seq2seq.png b/examples/tutorials/img/seq2seq.png new file mode 100644 index 0000000000..0557fccaaa Binary files /dev/null and b/examples/tutorials/img/seq2seq.png differ diff --git a/examples/tutorials/img/teacher_forcing.png b/examples/tutorials/img/teacher_forcing.png new file mode 100644 index 0000000000..852d93da1a Binary files /dev/null and b/examples/tutorials/img/teacher_forcing.png differ diff --git a/requirements.txt b/requirements.txt index 7b9d79f001..b2c3be9046 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,26 +1,19 @@ Cython==0.27.1 -numpy==1.14.3 -lxml==4.1.1 -tqdm==4.19.5 -requests==2.18.4 -tensorflow==1.8.0 overrides==1.9 -kenlm==0.0.0 -h5py==2.7.1 -keras==2.1.2 -gensim==2.3.0 -pandas==0.21.1 -fuzzywuzzy==0.16.0 -git+https://github.com/facebookresearch/fastText.git@3872afadb3a9f30de7c7792ff2ff1bda64242097 +numpy==1.14.5 +pandas==0.23.1 nltk==3.2.5 -scikit-learn==0.19.0 -spacy==2.0.5 +tqdm==4.23.4 +scipy==1.1.0 +h5py==2.8.0 +keras==2.2.0 +scikit-learn==0.19.1 +fuzzywuzzy==0.16.0 +pymorphy2==0.8 +pymorphy2-dicts-ru +requests==2.19.1 pytelegrambotapi==3.5.2 -python-Levenshtein==0.12.0 flask==0.12.2 flasgger==0.6.6 flask_cors==3.0.3 -scipy==1.0.0 -pymorphy2==0.8 -pymorphy2-dicts-ru -sortedcontainers==2.0.2 \ No newline at end of file +rusenttokenize==0.0.4 \ No newline at end of file diff --git a/requirements/fasttext.txt b/requirements/fasttext.txt new file mode 100644 index 0000000000..327253433f --- /dev/null +++ b/requirements/fasttext.txt @@ -0,0 +1,2 @@ +pybind11==2.2.3 +git+https://github.com/facebookresearch/fastText.git@25d0bb04bf43d8b674fe9ae5722ef65a0856f5d6#egg=fastText \ No newline at end of file diff --git a/requirements/gensim.txt b/requirements/gensim.txt new file mode 100644 index 0000000000..ce61965790 --- /dev/null +++ b/requirements/gensim.txt @@ -0,0 +1 @@ +gensim==2.3.0 \ No newline at end of file diff --git a/requirements/spacy.txt b/requirements/spacy.txt new file mode 100644 index 0000000000..4b7eccd8d9 --- /dev/null +++ b/requirements/spacy.txt @@ -0,0 +1 @@ +spacy==2.0.5 \ No newline at end of file diff --git 
a/requirements/spelling.txt b/requirements/spelling.txt new file mode 100644 index 0000000000..bc6605c003 --- /dev/null +++ b/requirements/spelling.txt @@ -0,0 +1,4 @@ +lxml==4.1.1 +python-Levenshtein==0.12.0 +git+https://github.com/kpu/kenlm.git@328cc2995202e84d29e3773203d29cdd6cc07132#egg=kenlm +sortedcontainers==2.0.2 \ No newline at end of file diff --git a/requirements/tf-gpu.txt b/requirements/tf-gpu.txt new file mode 100644 index 0000000000..effcf7e687 --- /dev/null +++ b/requirements/tf-gpu.txt @@ -0,0 +1 @@ +tensorflow-gpu==1.8.0 \ No newline at end of file diff --git a/requirements/tf.txt b/requirements/tf.txt new file mode 100644 index 0000000000..3980ace68c --- /dev/null +++ b/requirements/tf.txt @@ -0,0 +1 @@ +tensorflow==1.8.0 \ No newline at end of file diff --git a/setup.py b/setup.py index b5a65643f7..d8190dbe4f 100644 --- a/setup.py +++ b/setup.py @@ -15,14 +15,7 @@ import os import re -try: # for pip>=10.0.0 - from pip._internal.req import parse_requirements - from pip._internal.download import PipSession - from pip._internal import main as pip_main -except ImportError: # for pip<=9.0.3 - from pip.req import parse_requirements - from pip.download import PipSession - from pip import main as pip_main +from utils.pip_wrapper import install __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) @@ -30,13 +23,20 @@ def read_requirements(): # # parses requirements from requirements.txt reqs_path = os.path.join(__location__, 'requirements.txt') - install_reqs = parse_requirements(reqs_path, session=PipSession()) - reqs = [] - for ir in install_reqs: - pip_main(['install', str(ir.req or ir.link)]) - if ir.req: - reqs.append(str(ir.req)) - return reqs + with open(reqs_path) as f: + reqs = [line.strip() for line in f if not line.strip().startswith('#')] + + for req in reqs: + install(req) + + names = [] + links = [] + for req in reqs: + if '://' in req: + links.append(req) + else: + names.append(req) + return {'install_requires': names, 'dependency_links': links} def readme(): @@ -46,7 +46,7 @@ def readme(): meta = {} -with open('deeppavlov/package_meta.py') as f: +with open(os.path.join(__location__, 'deeppavlov/package_meta.py')) as f: exec(f.read(), meta) setup( @@ -63,5 +63,5 @@ def readme(): download_url='https://github.com/deepmipt/DeepPavlov/archive/' + meta['__version__'] + '.tar.gz', keywords=['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot'], include_package_data=True, - install_requires=read_requirements() + **read_requirements() ) diff --git a/tests/test_configs/intents/intents_snips_bigru.json b/tests/test_configs/intents/intents_snips_bigru.json index 9f05093c61..7c54f52722 100644 --- a/tests/test_configs/intents/intents_snips_bigru.json +++ b/tests/test_configs/intents/intents_snips_bigru.json @@ -61,7 +61,7 @@ "y_probas_dict" ], "main": true, - "name": "intent_model", + "name": "keras_classification_model", "save_path": "intents/intent_cnn_snips_bigru", "load_path": "intents/intent_cnn_snips_bigru", "classes": "#classes_vocab.keys()", @@ -103,6 +103,10 @@ "test_best": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt" + ], "labels": { "telegram_utils": "IntentModel" }, diff --git a/tests/test_configs/intents/intents_snips_bilstm.json b/tests/test_configs/intents/intents_snips_bilstm.json index 5496685fde..1698c293a4 100644 --- a/tests/test_configs/intents/intents_snips_bilstm.json +++ b/tests/test_configs/intents/intents_snips_bilstm.json @@ -61,7 +61,7 @@ "y_probas_dict" ], "main": 
true, - "name": "intent_model", + "name": "keras_classification_model", "save_path": "intents/intent_cnn_snips_bistlm", "load_path": "intents/intent_cnn_snips_bilstm", "classes": "#classes_vocab.keys()", @@ -103,6 +103,10 @@ "test_best": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt" + ], "labels": { "telegram_utils": "IntentModel" }, diff --git a/tests/test_configs/intents/intents_snips_bilstm_bilstm.json b/tests/test_configs/intents/intents_snips_bilstm_bilstm.json index e40bbb0775..182d3c0f3f 100644 --- a/tests/test_configs/intents/intents_snips_bilstm_bilstm.json +++ b/tests/test_configs/intents/intents_snips_bilstm_bilstm.json @@ -61,7 +61,7 @@ "y_probas_dict" ], "main": true, - "name": "intent_model", + "name": "keras_classification_model", "save_path": "intents/intent_cnn_snips_bistlm_bilstm", "load_path": "intents/intent_cnn_snips_bilstm_bilstm", "classes": "#classes_vocab.keys()", @@ -104,6 +104,10 @@ "test_best": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt" + ], "labels": { "telegram_utils": "IntentModel" }, diff --git a/tests/test_configs/intents/intents_snips_bilstm_cnn.json b/tests/test_configs/intents/intents_snips_bilstm_cnn.json index 82c13c89fe..47893e8138 100644 --- a/tests/test_configs/intents/intents_snips_bilstm_cnn.json +++ b/tests/test_configs/intents/intents_snips_bilstm_cnn.json @@ -61,7 +61,7 @@ "y_probas_dict" ], "main": true, - "name": "intent_model", + "name": "keras_classification_model", "save_path": "intents/intent_cnn_snips_bistlm_cnn", "load_path": "intents/intent_cnn_snips_bilstm_cnn", "classes": "#classes_vocab.keys()", @@ -110,6 +110,10 @@ "test_best": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt" + ], "labels": { "telegram_utils": "IntentModel" }, diff --git a/tests/test_configs/intents/intents_snips_bilstm_self_add_attention.json b/tests/test_configs/intents/intents_snips_bilstm_self_add_attention.json index 6e0b5660d1..3e701373b7 100644 --- a/tests/test_configs/intents/intents_snips_bilstm_self_add_attention.json +++ b/tests/test_configs/intents/intents_snips_bilstm_self_add_attention.json @@ -61,7 +61,7 @@ "y_probas_dict" ], "main": true, - "name": "intent_model", + "name": "keras_classification_model", "save_path": "intents/intent_cnn_snips_bilstm_self_add_attention", "load_path": "intents/intent_cnn_snips_bilstm_self_add_attention", "classes": "#classes_vocab.keys()", @@ -105,6 +105,10 @@ "test_best": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt" + ], "labels": { "telegram_utils": "IntentModel" }, diff --git a/tests/test_configs/intents/intents_snips_bilstm_self_mult_attention.json b/tests/test_configs/intents/intents_snips_bilstm_self_mult_attention.json index e707677f12..b8b4ed00a0 100644 --- a/tests/test_configs/intents/intents_snips_bilstm_self_mult_attention.json +++ b/tests/test_configs/intents/intents_snips_bilstm_self_mult_attention.json @@ -61,7 +61,7 @@ "y_probas_dict" ], "main": true, - "name": "intent_model", + "name": "keras_classification_model", "save_path": "intents/intent_cnn_snips_bilstm_self_mult_attention", "load_path": "intents/intent_cnn_snips_bilstm_self_mult_attention", "classes": "#classes_vocab.keys()", @@ -105,6 +105,10 @@ "test_best": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt" + ], "labels": { "telegram_utils": "IntentModel" }, diff 
--git a/tests/test_configs/intents/intents_snips_cnn_bilstm.json b/tests/test_configs/intents/intents_snips_cnn_bilstm.json index affdef2f07..6ab121b81e 100644 --- a/tests/test_configs/intents/intents_snips_cnn_bilstm.json +++ b/tests/test_configs/intents/intents_snips_cnn_bilstm.json @@ -61,7 +61,7 @@ "y_probas_dict" ], "main": true, - "name": "intent_model", + "name": "keras_classification_model", "save_path": "intents/intent_cnn_snips_cnn_bistlm", "load_path": "intents/intent_cnn_snips_cnn_bilstm", "classes": "#classes_vocab.keys()", @@ -110,6 +110,10 @@ "test_best": false }, "metadata": { + "requirements": [ + "../requirements/tf.txt", + "../requirements/fasttext.txt" + ], "labels": { "telegram_utils": "IntentModel" }, diff --git a/deeppavlov/configs/odqa/odqa_infer_test.json b/tests/test_configs/odqa/en_odqa_infer_wiki_test.json similarity index 80% rename from deeppavlov/configs/odqa/odqa_infer_test.json rename to tests/test_configs/odqa/en_odqa_infer_wiki_test.json index c1e4c2a9c4..cc086cb204 100644 --- a/deeppavlov/configs/odqa/odqa_infer_test.json +++ b/tests/test_configs/odqa/en_odqa_infer_wiki_test.json @@ -28,8 +28,8 @@ "fit_on_batch": [ "questions_raw" ], - "save_path": "odqa/wiki_tfidf_matrix_test.npz", - "load_path": "odqa/wiki_tfidf_matrix_test.npz", + "save_path": "odqa/en_wiki_test_tfidf.npz", + "load_path": "odqa/en_wiki_test_tfidf.npz", "tokenizer": { "name": "stream_spacy_tokenizer", "lemmas": true, @@ -66,11 +66,16 @@ ] }, "metadata": { + "requirements": [ + "../requirements/tf-gpu.txt", + "../requirements/spacy.txt" + ], "labels": { "server_utils": "ODQA" }, "download": [ - "http://lnsigo.mipt.ru/export/deeppavlov_data/odqa.tar.gz", + "http://lnsigo.mipt.ru/export/datasets/wikipedia/wiki_test.tar.gz", + "http://lnsigo.mipt.ru/export/deeppavlov_data/odqa_test.tar.gz", "http://lnsigo.mipt.ru/export/deeppavlov_data/squad_model_1.1.tar.gz" ] } diff --git a/deeppavlov/configs/odqa/ranker_test.json b/tests/test_configs/ranking/en_ranker_tfidf_wiki_test.json similarity index 76% rename from deeppavlov/configs/odqa/ranker_test.json rename to tests/test_configs/ranking/en_ranker_tfidf_wiki_test.json index bd15e4c4b2..3a8d503eb7 100644 --- a/deeppavlov/configs/odqa/ranker_test.json +++ b/tests/test_configs/ranking/en_ranker_tfidf_wiki_test.json @@ -31,8 +31,8 @@ "fit_on_batch": [ "x" ], - "save_path": "odqa/wiki_tfidf_matrix_test.npz", - "load_path": "odqa/wiki_tfidf_matrix_test.npz", + "save_path": "odqa/en_wiki_test_tfidf.npz", + "load_path": "odqa/en_wiki_test_tfidf.npz", "tokenizer": { "name": "stream_spacy_tokenizer", "lemmas": true, @@ -51,11 +51,15 @@ "batch_size": 2 }, "metadata": { + "requirements": [ + "../requirements/spacy.txt" + ], "labels": { "server_utils": "Ranker" }, "download": [ - "http://lnsigo.mipt.ru/export/deeppavlov_data/odqa.tar.gz" + "http://lnsigo.mipt.ru/export/datasets/wikipedia/wiki_test.tar.gz", + "http://lnsigo.mipt.ru/export/deeppavlov_data/odqa_test.tar.gz" ] } } \ No newline at end of file diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py index 63c6f6edce..17852c802a 100644 --- a/tests/test_quick_start.py +++ b/tests/test_quick_start.py @@ -21,7 +21,8 @@ download_path = tests_dir / "download" TEST_MODES = ['IP', # test_interacting_pretrained_model - 'TI' # test_consecutive_training_and_interacting + 'TI', # test_consecutive_training_and_interacting + 'E' # test_evolving ] ALL_MODES = ('IP', 'TI') @@ -59,7 +60,8 @@ ("intents/intents_dstc2.json", "intents", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], 
("intents/intents_dstc2_big.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK] }, - "snips": {("intents/intents_snips.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + "snips": { + ("intents/intents_snips.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("intents/intents_snips_bigru.json", "intents", ('TI')): [ONE_ARGUMENT_INFER_CHECK], ("intents/intents_snips_bilstm.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("intents/intents_snips_bilstm_bilstm.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], @@ -73,6 +75,9 @@ ("sentiment/sentiment_twitter.json", "sentiment", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], ("sentiment/sentiment_ag_news.json", "sentiment", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK] }, + "evolution": { + ("evolution/evolve_intents_snips.json", "evolution", ('E',)): None + }, "sample": { ("intents/intents_sample_csv.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("intents/intents_sample_json.json", "intents", ('TI',)): [ONE_ARGUMENT_INFER_CHECK] @@ -89,15 +94,15 @@ ("moderate price range", "{'pricerange': 'moderate'}") ] }, - "ranking": {("ranking/ranking_insurance.json", "ranking", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK]}, + "ranking": {("ranking/ranking_insurance.json", "ranking", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], + ("ranking/en_ranker_tfidf_wiki_test.json", "ranking", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK]}, "squad": { ("squad/squad.json", "squad_model", ALL_MODES): [TWO_ARGUMENTS_INFER_CHECK], ("squad/squad_ru.json", "squad_model_ru", ALL_MODES): [TWO_ARGUMENTS_INFER_CHECK] }, "seq2seq_go_bot": {("seq2seq_go_bot/bot_kvret.json", "seq2seq_go_bot", ALL_MODES): [FOUR_ARGUMENTS_INFER_CHECK]}, "odqa": { - ("odqa/ranker_test.json", "odqa", ()): [ONE_ARGUMENT_INFER_CHECK], - ("odqa/odqa_infer_test.json", "odqa", ()): [ONE_ARGUMENT_INFER_CHECK] + ("odqa/en_odqa_infer_wiki_test.json", "odqa", ('IP',)): [ONE_ARGUMENT_INFER_CHECK] }, "morpho_tagger/UD2.0/hu": {("morpho_tagger/UD2.0/hu/morpho_hu_train.json", "morpho_tagger", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK]} @@ -166,6 +171,16 @@ def teardown_module(): @pytest.mark.parametrize("model,conf_file,model_dir,mode", TEST_GRID, scope='class') class TestQuickStart(object): + @staticmethod + def install(conf_file): + logfile = io.BytesIO(b'') + _, exitstatus = pexpect.run(sys.executable + " -m deeppavlov install " + str(conf_file), timeout=None, + withexitstatus=True, + logfile=logfile) + if exitstatus != 0: + logfile.seek(0) + raise RuntimeError('Installing process of {} returned non-zero exit code: \n{}' + .format(conf_file, ''.join((line.decode() for line in logfile.readlines())))) @staticmethod def interact(conf_file, model_dir, qr_list=None): @@ -236,6 +251,7 @@ def interact_api(conf_file): def test_interacting_pretrained_model(self, model, conf_file, model_dir, mode): if 'IP' in mode: config_file_path = str(test_configs_path.joinpath(conf_file)) + self.install(config_file_path) deep_download(['-test', '-c', config_file_path]) self.interact(test_configs_path / conf_file, model_dir, PARAMS[model][(conf_file, model_dir, mode)]) @@ -258,6 +274,7 @@ def test_consecutive_training_and_interacting(self, model, conf_file, model_dir, if 'IP' not in mode: config_path = str(test_configs_path.joinpath(conf_file)) + self.install(config_path) deep_download(['-test', '-c', config_path]) shutil.rmtree(str(model_path), ignore_errors=True) @@ -273,3 +290,26 @@ def test_consecutive_training_and_interacting(self, model, conf_file, model_dir, shutil.rmtree(str(download_path), ignore_errors=True) else: 
             pytest.skip("Unsupported mode: {}".format(mode))
+
+    def test_evolving(self, model, conf_file, model_dir, mode):
+        if 'E' in mode:
+            c = test_configs_path / conf_file
+            model_path = download_path / model_dir
+
+            if 'IP' not in mode and 'TI' not in mode:
+                config_path = str(test_configs_path.joinpath(conf_file))
+                deep_download(['-test', '-c', config_path])
+                shutil.rmtree(str(model_path), ignore_errors=True)
+
+            logfile = io.BytesIO(b'')
+            _, exitstatus = pexpect.run(sys.executable + " -m deeppavlov.evolve " + str(c) + " --iterations 1 --p_size 1",
+                                        timeout=None, withexitstatus=True,
+                                        logfile=logfile)
+            if exitstatus != 0:
+                logfile.seek(0)
+                raise RuntimeError('Training process of {} returned non-zero exit code: \n{}'
+                                   .format(model_dir, ''.join((line.decode() for line in logfile.readlines()))))
+
+            shutil.rmtree(str(download_path), ignore_errors=True)
+        else:
+            pytest.skip("Unsupported mode: {}".format(mode))
diff --git a/utils/pip_wrapper/__init__.py b/utils/pip_wrapper/__init__.py
new file mode 100644
index 0000000000..24cb413c4d
--- /dev/null
+++ b/utils/pip_wrapper/__init__.py
@@ -0,0 +1 @@
+from .pip_wrapper import *
\ No newline at end of file
diff --git a/utils/pip_wrapper/pip_wrapper.py b/utils/pip_wrapper/pip_wrapper.py
new file mode 100644
index 0000000000..8e3e013300
--- /dev/null
+++ b/utils/pip_wrapper/pip_wrapper.py
@@ -0,0 +1,50 @@
+import re
+import subprocess
+import sys
+from pathlib import Path
+import os
+
+from deeppavlov.core.commands.utils import expand_path
+from deeppavlov.core.common.file import read_json
+from deeppavlov.core.common.log import get_logger
+
+
+log = get_logger(__name__)
+
+_tf_re = re.compile(r'\s*tensorflow\s*([<=>;]|$)')
+_spacy_re = re.compile(r'\s*spacy\s*([<=>;]|$)')
+
+
+def install(*packages):
+    if any(_tf_re.match(package) for package in packages)\
+            and b'tensorflow-gpu' in subprocess.check_output([sys.executable, '-m', 'pip', 'freeze'],
+                                                             env=os.environ.copy()):
+        log.warn('found tensorflow-gpu installed, so upgrading it instead of tensorflow')
+        packages = [_tf_re.sub(r'tensorflow-gpu\1', package) for package in packages]
+    result = subprocess.check_call([sys.executable, '-m', 'pip', 'install',
+                                    *[re.sub(r'\s', '', package) for package in packages]],
+                                   env=os.environ.copy())
+    if any(_spacy_re.match(package) for package in packages):
+        subprocess.check_call([sys.executable, '-m', 'spacy', 'download', 'en'], env=os.environ.copy())
+    return result
+
+
+def install_from_config(config: [str, Path, dict]):
+    if isinstance(config, (str, Path)):
+        config: dict = read_json(config)
+    requirements_files = config.get('metadata', {}).get('requirements', [])
+
+    if not requirements_files:
+        log.warn('No requirements found in config')
+        return
+
+    requirements = []
+    for rf in requirements_files:
+        with expand_path(rf).open() as f:
+            for line in f:
+                line = re.sub(r'\s', '', line.strip())
+                if line and not line.startswith('#') and line not in requirements:
+                    requirements.append(line)
+
+    for r in requirements:
+        install(r)
diff --git a/deeppavlov/skills/seq2seq_go_bot/__init__.py b/utils/prepare/__init__.py
similarity index 100%
rename from deeppavlov/skills/seq2seq_go_bot/__init__.py
rename to utils/prepare/__init__.py
diff --git a/utils/prepare/registry.py b/utils/prepare/registry.py
new file mode 100644
index 0000000000..017a86acf4
--- /dev/null
+++ b/utils/prepare/registry.py
@@ -0,0 +1,20 @@
+import pkgutil
+import json
+
+import deeppavlov
+from deeppavlov.core.common.registry import _registry_path as c_registry_path, _REGISTRY as C_REGISTRY
+from deeppavlov.core.common.metrics_registry import _registry_path as m_registry_path, _REGISTRY as M_REGISTRY
+
+
+if __name__ == '__main__':
+    C_REGISTRY.clear()
+    M_REGISTRY.clear()
+
+    for _, pkg_name, _ in pkgutil.walk_packages(deeppavlov.__path__, deeppavlov.__name__+'.'):
+        __import__(pkg_name)
+
+    with c_registry_path.open('w', encoding='utf-8') as f:
+        json.dump(dict(sorted(C_REGISTRY.items())), f, indent=2)
+
+    with m_registry_path.open('w', encoding='utf-8') as f:
+        json.dump(dict(sorted(M_REGISTRY.items())), f, indent=2)
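
For reference, a minimal sketch of how the pieces introduced above fit together: a config now declares its pip dependencies under `metadata.requirements`, and the new `utils/pip_wrapper` helpers read and install them (the updated tests shell out to `python -m deeppavlov install <config>` for the same effect). The direct import below is an assumption for illustration only; it presumes a source checkout where `utils` is importable from the project root, and the config path is one of the test configs added in this diff.

```python
# Hypothetical usage sketch, not part of the diff: install a config's requirements directly.
# Assumes a DeepPavlov source checkout run from the project root with `utils` importable.
from utils.pip_wrapper import install_from_config

# install_from_config() reads metadata["requirements"] from the config (files such as
# "../requirements/spacy.txt"), collects the unique requirement lines and pip-installs
# each one via install(), which swaps tensorflow for tensorflow-gpu when a GPU build is
# already present and fetches the spaCy "en" model when spacy is among the requirements.
install_from_config('tests/test_configs/ranking/en_ranker_tfidf_wiki_test.json')
```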