Release 0.1.6

deeppavlov · Jan 23, 2019 · 66c676c · 66c676c
2 parents 137c906 + d00cd59
commit 66c676c
Show file tree

Hide file tree

Showing 33 changed files with 347 additions and 92 deletions.
diff --git a/README.md b/README.md
@@ -49,7 +49,7 @@ Give the floor to the HelloBot!
 print(HelloBot(['Hello!', 'Boo...', 'Bye.']))
 ```
 
-[Jupyther notebook with HelloBot example.](https://colab.research.google.com/github/deepmipt/DeepPavlov/blob/master/docs/intro/hello_bot.ipynb)
+[Jupyter notebook with HelloBot example.](https://colab.research.google.com/github/deepmipt/DeepPavlov/blob/master/docs/intro/hello_bot.ipynb)
 
 
 # Features

diff --git a/deeppavlov/__init__.py b/deeppavlov/__init__.py
@@ -35,7 +35,7 @@ def evaluate_model(config: [str, Path, dict], download: bool = False, recursive:
 except ImportError:
     'Assuming that requirements are not yet installed'
 
-__version__ = '0.1.5.1'
+__version__ = '0.1.6'
 __author__ = 'Neural Networks and Deep Learning lab, MIPT'
 __description__ = 'An open source library for building end-to-end dialog systems and training chatbots.'
 __keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot']

diff --git a/deeppavlov/configs/classifiers/rusentiment_cnn.json b/deeppavlov/configs/classifiers/rusentiment_cnn.json
@@ -152,14 +152,6 @@
       "server_utils": "KerasIntentModel"
     },
     "download": [
-      {
-        "url": "https://github.com/text-machine-lab/rusentiment/raw/master/Dataset/rusentiment_random_posts.csv",
-        "subdir": "{DOWNLOADS_PATH}/rusentiment"
-      },
-      {
-        "url": "https://github.com/text-machine-lab/rusentiment/raw/master/Dataset/rusentiment_test.csv",
-        "subdir": "{DOWNLOADS_PATH}/rusentiment"
-      },
       {
         "url": "http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin",
         "subdir": "{DOWNLOADS_PATH}/embeddings"

diff --git a/deeppavlov/configs/classifiers/rusentiment_elmo.json b/deeppavlov/configs/classifiers/rusentiment_elmo.json
@@ -161,14 +161,6 @@
       "server_utils": "KerasIntentModel"
     },
     "download": [
-      {
-        "url": "https://github.com/text-machine-lab/rusentiment/raw/master/Dataset/rusentiment_random_posts.csv",
-        "subdir": "{DOWNLOADS_PATH}/rusentiment"
-      },
-      {
-        "url": "https://github.com/text-machine-lab/rusentiment/raw/master/Dataset/rusentiment_test.csv",
-        "subdir": "{DOWNLOADS_PATH}/rusentiment"
-      },
       {
         "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/rusentiment_v4.tar.gz",
         "subdir": "{ROOT_PATH}/models/classifiers"

diff --git a/deeppavlov/configs/odqa/en_odqa_infer_enwiki20161221.json b/deeppavlov/configs/odqa/en_odqa_infer_enwiki20161221.json
@@ -50,7 +50,8 @@
           "questions"
         ],
         "out": [
-          "best_answer"
+          "best_answer",
+          "best_answer_score"
         ]
       }
     ]

diff --git a/deeppavlov/configs/odqa/en_odqa_infer_wiki.json b/deeppavlov/configs/odqa/en_odqa_infer_wiki.json
@@ -50,7 +50,8 @@
           "questions"
         ],
         "out": [
-          "best_answer"
+          "best_answer",
+          "best_answer_score"
         ]
       }
     ]

diff --git a/deeppavlov/configs/odqa/en_odqa_pop_infer_enwiki20180211.json b/deeppavlov/configs/odqa/en_odqa_pop_infer_enwiki20180211.json
@@ -57,7 +57,8 @@
           "questions"
         ],
         "out": [
-          "best_answer"
+          "best_answer",
+          "best_answer_score"
         ]
       }
     ]

diff --git a/deeppavlov/configs/odqa/ru_odqa_infer_wiki.json b/deeppavlov/configs/odqa/ru_odqa_infer_wiki.json
@@ -43,14 +43,15 @@
       {
         "class_name": "logit_ranker",
         "batch_size": 10,
-        "squad_model": {"config_path": "{CONFIGS_PATH}/squad/multi_squad_noans_infer.json"},
+        "squad_model": {"config_path": "{CONFIGS_PATH}/squad/squad_ru.json"},
         "sort_noans": true,
         "in": [
           "chunks",
           "questions"
         ],
         "out": [
-          "best_answer"
+          "best_answer",
+          "best_answer_score"
         ]
       }
     ]

diff --git a/deeppavlov/contrib/__init__.py b/deeppavlov/contrib/__init__.py
diff --git a/deeppavlov/contrib/skills/__init__.py b/deeppavlov/contrib/skills/__init__.py
diff --git a/deeppavlov/contrib/skills/similarity_matching_skill/__init__.py b/deeppavlov/contrib/skills/similarity_matching_skill/__init__.py
@@ -0,0 +1 @@
+from .similarity_matching_skill import SimilarityMatchingSkill
diff --git a/deeppavlov/contrib/skills/similarity_matching_skill/similarity_matching_skill.py b/deeppavlov/contrib/skills/similarity_matching_skill/similarity_matching_skill.py
@@ -0,0 +1,82 @@
+from typing import Tuple, Optional, List
+
+from deeppavlov import train_model
+from deeppavlov import build_model
+from deeppavlov.core.commands.utils import expand_path
+from deeppavlov.core.common.log import get_logger
+from deeppavlov.core.skill.skill import Skill
+from deeppavlov.core.common.file import read_json
+from deeppavlov.configs import configs
+from deeppavlov.core.data.utils import update_dict_recursive
+
+log = get_logger(__name__)
+
+
+class SimilarityMatchingSkill(Skill):
+    """Skill that matches utterances to known phrases and returns predefined answers.
+
+    Allows creating skills based on a '.csv' table that give a response to the corresponding user's utterance.
+    The skill returns a response and a confidence.
+
+    Args:
+        data_path: URL or local path to '.csv' file that contains two columns with Utterances and Responses.
+            User's utterance will be compared with Utterances column and response will be selected
+            from matching row from Responses column. 'http://files.deeppavlov.ai/faq/school/faq_school.csv' by default.
+        x_col_name: Name of the column in '.csv' file, that represents Utterances column. 'Question' by default.
+        y_col_name: Name of the column in '.csv' file, that represents Responses column. 'Answer' by default.
+        save_load_path: Path, where model will be saved or loaded from. './similarity_matching' by default.
+        edit_dict: Dictionary of edits in config (has higher priority than the previous arguments).
+        train: Should model be trained or not. True by default.
+
+    Attributes:
+        model: Classifies user's utterance
+    """
+
+    def __init__(self, data_path: Optional[str] = None,
+                 x_col_name: Optional[str] = None, y_col_name: Optional[str] = None,
+                 save_load_path: Optional[str] = './similarity_matching',
+                 edit_dict: Optional[dict] = None, train: bool = True):
+
+        # start from the stock config and override only what the caller specified
+        model_config = read_json(configs.faq.tfidf_autofaq)
+        if x_col_name is not None:
+            model_config['dataset_reader']['x_col_name'] = x_col_name
+        if y_col_name is not None:
+            model_config['dataset_reader']['y_col_name'] = y_col_name
+
+        model_config['metadata']['variables']['ROOT_PATH'] = save_load_path
+
+        if data_path is not None:
+            # a path that exists locally is passed as 'data_path', anything else is passed as 'data_url';
+            # the other key is dropped so that the reader config never carries both
+            if expand_path(data_path).exists():
+                if 'data_url' in model_config['dataset_reader']:
+                    del model_config['dataset_reader']['data_url']
+                model_config['dataset_reader']['data_path'] = data_path
+            else:
+                if 'data_path' in model_config['dataset_reader']:
+                    del model_config['dataset_reader']['data_path']
+                model_config['dataset_reader']['data_url'] = data_path
+
+        # applied last, so explicit edits win over the keyword arguments above
+        if edit_dict is not None:
+            update_dict_recursive(model_config, edit_dict)
+
+        if train:
+            self.model = train_model(model_config)
+            # f-string instead of '+' concatenation: a non-str save_load_path (None, pathlib.Path)
+            # must not raise TypeError after the training has already finished
+            log.info(f'Your model was saved at: \'{save_load_path}\'')
+        else:
+            self.model = build_model(model_config)
+
+    def __call__(self, utterances_batch: List[str], history_batch: List[List[str]],
+                 states_batch: Optional[list] = None) -> Tuple[List[str], List[float]]:
+        """Returns skill inference result.
+
+        Returns batches of skill inference results and estimated confidences.
+        Only ``utterances_batch`` is passed to the model; ``history_batch`` and ``states_batch``
+        are accepted but not used.
+
+        Args:
+            utterances_batch: A batch of utterances.
+            history_batch: A batch of list typed histories for each utterance.
+            states_batch: Optional. A batch of arbitrary typed states for
+                each utterance.
+
+        Returns:
+            Batches of skill inference results and estimated confidences
+        """
+        return self.model(utterances_batch)
diff --git a/deeppavlov/core/commands/train.py b/deeppavlov/core/commands/train.py
@@ -106,7 +106,10 @@ def fit_chainer(config: dict, iterator: Union[DataLearningIterator, DataFittingI
                 (('fit_on' in component_config) and
                  callable(getattr(component, 'partial_fit', None))):
             component: Estimator
-            targets = component_config.get('fit_on', component_config['fit_on_batch'])
+            try:
+                targets = component_config['fit_on']
+            except KeyError:
+                targets = component_config['fit_on_batch']
             if isinstance(targets, str):
                 targets = [targets]
 

diff --git a/deeppavlov/core/data/utils.py b/deeppavlov/core/data/utils.py
@@ -15,6 +15,7 @@
 """
 import gzip
 import os
+import collections
 import re
 import secrets
 import shutil
@@ -421,4 +422,23 @@ def jsonify_data(data):
 
 def chunk_generator(items_list, chunk_size):
     for i in range(0, len(items_list), chunk_size):
-        yield items_list[i:i + chunk_size]
+        yield items_list[i:i + chunk_size]
+
+
+def update_dict_recursive(editable_dict: dict, editing_dict: dict) -> None:
+    """Updates dict recursively
+
+    You need to use this function to update dictionary if depth of editing_dict is more than 1
+
+    Nested mappings from editing_dict are merged key by key into editable_dict, which is modified in place.
+    If a nested mapping has no counterpart in editable_dict (the key is missing or holds a non-mapping value),
+    a new dictionary is created under that key, so the nested edits are not lost.
+
+    Args:
+        editable_dict: dictionary, that will be edited
+        editing_dict: dictionary, that contains edits
+    Returns:
+        None
+    """
+    # imported locally: the ABC aliases such as ``collections.Mapping`` were removed in Python 3.10
+    from collections.abc import Mapping, MutableMapping
+
+    for k, v in editing_dict.items():
+        if isinstance(v, Mapping):
+            nested = editable_dict.get(k)
+            if not isinstance(nested, MutableMapping):
+                # ``.get(k, {})`` would hand the recursion a throwaway dict; attach it to the parent instead
+                nested = editable_dict[k] = {}
+            update_dict_recursive(nested, v)
+        else:
+            editable_dict[k] = v
+
diff --git a/deeppavlov/models/doc_retrieval/logit_ranker.py b/deeppavlov/models/doc_retrieval/logit_ranker.py
@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import List, Union
+from typing import List, Union, Tuple
 from operator import itemgetter
+import warnings
 
 from deeppavlov.core.common.registry import register
 from deeppavlov.core.common.log import get_logger
@@ -45,7 +46,8 @@ def __init__(self, squad_model: Union[Chainer, Component], batch_size: int = 50,
         self.batch_size = batch_size
         self.sort_noans = sort_noans
 
-    def __call__(self, contexts_batch: List[List[str]], questions_batch: List[List[str]]) -> List[str]:
+    def __call__(self, contexts_batch: List[List[str]], questions_batch: List[List[str]]) -> \
+            Tuple[List[str], List[float]]:
         """
         Sort obtained results from squad reader by logits and get the answer with a maximum logit.
 
@@ -54,11 +56,16 @@ def __call__(self, contexts_batch: List[List[str]], questions_batch: List[List[s
             questions_batch: a batch of questions which should be treated as a single batch in the outer JSON config
 
         Returns:
-            a batch of best answers
+            a batch of best answers and their scores
 
         """
+        # TODO output result for top_n
+        warnings.warn(f'{self.__class__.__name__}.__call__() API will be changed in the future release.'
+                      ' Instead of returning Tuple(List[str], List[float] will return'
+                      ' Tuple(List[List[str]], List[List[float]]).', FutureWarning)
 
         batch_best_answers = []
+        batch_best_answers_scores = []
         for contexts, questions in zip(contexts_batch, questions_batch):
             results = []
             for i in range(0, len(contexts), self.batch_size):
@@ -70,6 +77,6 @@ def __call__(self, contexts_batch: List[List[str]], questions_batch: List[List[s
                 results = sorted(results, key=lambda x: (x[0] != '', x[2]), reverse=True)
             else:
                 results = sorted(results, key=itemgetter(2), reverse=True)
-            best_answer = results[0][0]
-            batch_best_answers.append(best_answer)
-        return batch_best_answers
+            batch_best_answers.append(results[0][0])
+            batch_best_answers_scores.append(results[0][2])
+        return batch_best_answers, batch_best_answers_scores
diff --git a/docs/_static/aws_ec2/01_login_to_aws.png b/docs/_static/aws_ec2/01_login_to_aws.png
diff --git a/docs/_static/aws_ec2/02_choose_ubuntu.png b/docs/_static/aws_ec2/02_choose_ubuntu.png
diff --git a/docs/_static/aws_ec2/03_select_instance_type.png b/docs/_static/aws_ec2/03_select_instance_type.png
diff --git a/docs/_static/aws_ec2/04_add_storage.png b/docs/_static/aws_ec2/04_add_storage.png
diff --git a/docs/_static/aws_ec2/05_review_instance.png b/docs/_static/aws_ec2/05_review_instance.png
diff --git a/docs/_static/aws_ec2/06_go_to_running_instances.png b/docs/_static/aws_ec2/06_go_to_running_instances.png
diff --git a/docs/_static/aws_ec2/07_wait_init.png b/docs/_static/aws_ec2/07_wait_init.png
diff --git a/docs/_static/aws_ec2/08_01_set_sec_group.png b/docs/_static/aws_ec2/08_01_set_sec_group.png
diff --git a/docs/_static/aws_ec2/08_02_set_inbound.png b/docs/_static/aws_ec2/08_02_set_inbound.png
diff --git a/docs/_static/aws_ec2/09_01_select_connect.png b/docs/_static/aws_ec2/09_01_select_connect.png
diff --git a/docs/_static/aws_ec2/09_02_connection_info.png b/docs/_static/aws_ec2/09_02_connection_info.png
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		from .similarity_matching_skill import SimilarityMatchingSkill