From 9fce17a150526d038d68528e937c72f1eb7f0bc6 Mon Sep 17 00:00:00 2001 From: Pavel Gulyaev Date: Tue, 10 Dec 2019 16:48:54 +0300 Subject: [PATCH 01/15] feat: add Chinese SQUAD on Chinese BERT config (#1092) * Add chinese BERT config * Append metrics to documentation * Add tests to chinese squads * Rename models * Fixed tabular markup in documentation --- ...d_zh_bert.json => squad_zh_bert_mult.json} | 0 .../configs/squad/squad_zh_bert_zh.json | 112 ++++++++++++++++++ docs/features/models/squad.rst | 23 ++++ docs/features/overview.rst | 5 + tests/test_quick_start.py | 4 +- 5 files changed, 143 insertions(+), 1 deletion(-) rename deeppavlov/configs/squad/{squad_zh_bert.json => squad_zh_bert_mult.json} (100%) create mode 100644 deeppavlov/configs/squad/squad_zh_bert_zh.json diff --git a/deeppavlov/configs/squad/squad_zh_bert.json b/deeppavlov/configs/squad/squad_zh_bert_mult.json similarity index 100% rename from deeppavlov/configs/squad/squad_zh_bert.json rename to deeppavlov/configs/squad/squad_zh_bert_mult.json diff --git a/deeppavlov/configs/squad/squad_zh_bert_zh.json b/deeppavlov/configs/squad/squad_zh_bert_zh.json new file mode 100644 index 0000000000..8a3d5e275f --- /dev/null +++ b/deeppavlov/configs/squad/squad_zh_bert_zh.json @@ -0,0 +1,112 @@ +{ + "dataset_reader": { + "class_name": "squad_dataset_reader", + "dataset": "SQuAD", + "url": "http://files.deeppavlov.ai/datasets/DRCD.tar.gz", + "data_path": "{DOWNLOADS_PATH}/DRCD_train/" + }, + "dataset_iterator": { + "class_name": "squad_iterator", + "seed": 1337, + "shuffle": true + }, + "chainer": { + "in": ["context_raw", "question_raw"], + "in_y": ["ans_raw", "ans_raw_start"], + "pipe": [ + { + "class_name": "bert_preprocessor", + "vocab_file": "{DOWNLOADS_PATH}/bert_models/chinese_L-12_H-768_A-12/vocab.txt", + "do_lower_case": "{lowercase}", + "max_seq_length": 384, + "in": ["question_raw", "context_raw"], + "out": ["bert_features"] + }, + { + "class_name": "squad_bert_mapping", + "do_lower_case": "{lowercase}", + "in": ["context_raw", "bert_features"], + "out": ["subtok2chars", "char2subtoks"] + }, + { + "class_name": "squad_bert_ans_preprocessor", + "do_lower_case": "{lowercase}", + "in": ["ans_raw", "ans_raw_start","char2subtoks"], + "out": ["ans", "ans_start", "ans_end"] + }, + { + "class_name": "squad_bert_model", + "bert_config_file": "{DOWNLOADS_PATH}/bert_models/chinese_L-12_H-768_A-12/bert_config.json", + "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/chinese_L-12_H-768_A-12/bert_model.ckpt", + "save_path": "{MODELS_PATH}/squad_zh_bert/model_zh", + "load_path": "{MODELS_PATH}/squad_zh_bert/model_zh", + "keep_prob": 0.5, + "learning_rate": 2e-05, + "learning_rate_drop_patience": 3, + "learning_rate_drop_div": 2.0, + "in": ["bert_features"], + "in_y": ["ans_start", "ans_end"], + "out": ["ans_start_predicted", "ans_end_predicted", "logits", "score"] + }, + { + "class_name": "squad_bert_ans_postprocessor", + "in": ["ans_start_predicted", "ans_end_predicted", "context_raw", "bert_features", "subtok2chars"], + "out": ["ans_predicted", "ans_start_predicted", "ans_end_predicted"] + } + ], + "out": ["ans_predicted", "ans_start_predicted", "logits"] + }, + "train": { + "show_examples": false, + "test_best": false, + "validate_best": true, + "log_every_n_batches": 250, + "val_every_n_batches": 500, + "batch_size": 10, + "pytest_max_batches": 2, + "pytest_batch_size": 5, + "validation_patience": 10, + "metrics": [ + { + "name": "squad_v2_f1", + "inputs": ["ans", "ans_predicted"] + }, + { + "name": "squad_v2_em", + "inputs": ["ans", 
"ans_predicted"] + }, + { + "name": "squad_v1_f1", + "inputs": ["ans", "ans_predicted"] + }, + { + "name": "squad_v1_em", + "inputs": ["ans", "ans_predicted"] + } + ], + "tensorboard_log_dir": "{MODELS_PATH}/squad_zh_bert/logs" + }, + "metadata": { + "variables": { + "lowercase": false, + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models" + }, + "requirements": [ + "{DEEPPAVLOV_PATH}/requirements/tf.txt", + "{DEEPPAVLOV_PATH}/requirements/bert_dp.txt" + ], + "download": [ + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/chinese_L-12_H-768_A-12.zip", + "subdir": "{DOWNLOADS_PATH}/bert_models" + }, + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/squad_model_zh_zhbert.tar.gz", + "subdir": "{MODELS_PATH}" + } + ] + } +} + diff --git a/docs/features/models/squad.rst b/docs/features/models/squad.rst index d8e58118f0..5d58394875 100644 --- a/docs/features/models/squad.rst +++ b/docs/features/models/squad.rst @@ -218,3 +218,26 @@ Link to SDSJ Task B dataset: http://files.deeppavlov.ai/datasets/sber_squad-v1.1 +------------------------------------------------------------------------+----------------+-----------------+ | :config:`DeepPavlov R-Net ` | 60.62 | 80.04 | +------------------------------------------------------------------------+----------------+-----------------+ + + +DRCD +~~~~~~~~~~~ + +Pretrained models are available and can be downloaded: + +.. code:: bash + + python -m deeppavlov download deeppavlov/configs/squad/squad_zh_bert.json + + python -m deeppavlov download deeppavlov/configs/squad/squad_zh_zh_bert.json + +Link to DRCD dataset: http://files.deeppavlov.ai/datasets/DRCD.tar.gz +Link to DRCD paper: https://arxiv.org/abs/1806.00920 + ++------------------------------------------------------------------------+----------------+-----------------+ +| Model config | EM (dev) | F-1 (dev) | ++========================================================================+================+=================+ +| :config:`DeepPavlov ChineseBERT ` | 85.13+-0.26 | 85.15+-0.12 | ++------------------------------------------------------------------------+----------------+-----------------+ +| :config:`DeepPavlov multilingual BERT ` | 84.18+-0.34 | 84.08+-0.07 | ++------------------------------------------------------------------------+----------------+-----------------+ diff --git a/docs/features/overview.rst b/docs/features/overview.rst index ae21c2cf6a..d887f31663 100644 --- a/docs/features/overview.rst +++ b/docs/features/overview.rst @@ -386,6 +386,10 @@ R-Net model is based on `R-NET: Machine Reading Comprehension with Self-matching +---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+ | SDSJ Task B | :config:`DeepPavlov R-Net ` | ru | 60.62 | 80.04 | ~5Gb | +---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+ +| `DRCD`_ | :config:`DeepPavlov multilingual BERT ` | ch | 84.18 | 84.08 | 630Mb | ++---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+ +| `DRCD`_ | :config:`DeepPavlov Chinese BERT ` | ch | 85.13 | 85.15 | 362Mb | ++---------------+------------------------------------------------------------------------+-------+----------------+-----------------+-----------------+ In the case when answer is not necessary present in 
given context we have :config:`squad_noans ` model. This model outputs empty string in case if there is no answer in context. @@ -610,3 +614,4 @@ goal-oriented bot and a slot-filling model with Telegram UI. .. _`SQuAD-v1.1`: https://arxiv.org/abs/1606.05250 .. _`SDSJ Task B`: https://sdsj.sberbank.ai/2017/ru/contest.html +.. _`DRCD`: https://arxiv.org/abs/1806.00920 diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py index 9e914fda9e..c758b30cf5 100644 --- a/tests/test_quick_start.py +++ b/tests/test_quick_start.py @@ -211,7 +211,9 @@ ("squad/squad_bert_infer.json", "squad_bert_infer", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK], ("squad/squad.json", "squad_model", ALL_MODES): [TWO_ARGUMENTS_INFER_CHECK], ("squad/squad_ru.json", "squad_model_ru", ALL_MODES): [TWO_ARGUMENTS_INFER_CHECK], - ("squad/multi_squad_noans.json", "multi_squad_noans", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK] + ("squad/multi_squad_noans.json", "multi_squad_noans", ('IP',)): [TWO_ARGUMENTS_INFER_CHECK], + ("squad/squad_zh_bert_mult.json", "squad_zh_bert_mult", ALL_MODES): [TWO_ARGUMENTS_INFER_CHECK], + ("squad/squad_zh_bert_zh.json", "squad_zh_bert_zh", ALL_MODES): [TWO_ARGUMENTS_INFER_CHECK] }, "seq2seq_go_bot": { ("seq2seq_go_bot/bot_kvret_train.json", "seq2seq_go_bot", ('TI',)): From 4a16bc17e0b42ad48b539347b1cf2e2ba31e85a3 Mon Sep 17 00:00:00 2001 From: Fedor Ignatov Date: Mon, 16 Dec 2019 11:28:38 +0300 Subject: [PATCH 02/15] fix: risesocket mode host name appropriate for Windows (#1096) --- tests/test_quick_start.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py index c758b30cf5..987f07bff8 100644 --- a/tests/test_quick_start.py +++ b/tests/test_quick_start.py @@ -439,6 +439,7 @@ def interact_socket(config_path, socket_type): model_args_names = socket_params['model_args_names'] host = socket_params['host'] + host = host.replace('0.0.0.0', '127.0.0.1') port = api_port or socket_params['port'] socket_payload = {} From ba1af3a920ea373eb72f69a47a2f77525e7faf53 Mon Sep 17 00:00:00 2001 From: Anastasia Kravtsova <37542195+AnastasijaKravtsova@users.noreply.github.com> Date: Tue, 17 Dec 2019 18:05:44 +0300 Subject: [PATCH 03/15] docs: fix build_model doc for syntax_ru_syntagrus_bert (#1101) --- docs/features/models/syntaxparser.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/features/models/syntaxparser.rst b/docs/features/models/syntaxparser.rst index 60659cfbb2..59adb547cd 100644 --- a/docs/features/models/syntaxparser.rst +++ b/docs/features/models/syntaxparser.rst @@ -45,7 +45,7 @@ The example usage for inference is .. code:: python from deeppavlov import build_model, configs - model = build_model(configs.syntax_parser.syntax_ru_syntagrus_bert, download=True) + model = build_model(configs.syntax.syntax_ru_syntagrus_bert, download=True) sentences = ["Я шёл домой по незнакомой улице.", "Девушка пела в церковном хоре."] for parse in model(sentences): print(parse, end="\n\n") @@ -167,4 +167,4 @@ and dependency head. .. _`UD Pipe Future`: https://github.com/CoNLL-UD-2018/UDPipe-Future .. _`UDify (multilingual BERT)`: https://github.com/hyperparticle/udify -So our model is by a valuable margin the state-of-the-art system for Russian syntactic parsing. \ No newline at end of file +So our model is by a valuable margin the state-of-the-art system for Russian syntactic parsing. 
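For context on the fix above: attributes of the ``configs`` namespace mirror the package layout ``deeppavlov/configs/<subdir>/<name>.json``, and the parser config lives under ``deeppavlov/configs/syntax/``, so it is addressed as ``configs.syntax.syntax_ru_syntagrus_bert`` rather than the previous ``configs.syntax_parser`` spelling, which does not resolve. A minimal usage sketch, assuming a local DeepPavlov install with network access for the ``download=True`` step:

.. code:: python

    from deeppavlov import build_model, configs

    # configs.<subdir>.<name> resolves to the packaged JSON config file
    # (here a path under deeppavlov/configs/syntax/), which build_model
    # accepts directly.
    model = build_model(configs.syntax.syntax_ru_syntagrus_bert, download=True)

    # Same inference pattern as in the documentation patched above:
    # one CoNLL-U-style parse per input sentence.
    for parse in model(["Девушка пела в церковном хоре."]):
        print(parse)
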
From 8590133501323513e56cac4c90991a20f3b082b9 Mon Sep 17 00:00:00 2001 From: Aleksei Lymar Date: Mon, 23 Dec 2019 14:09:41 +0300 Subject: [PATCH 04/15] docs: fix syntax parser headers levels (#1105) All syntax parser headers were in the top level of the table of contents --- docs/features/models/syntaxparser.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/features/models/syntaxparser.rst b/docs/features/models/syntaxparser.rst index 59adb547cd..7e3edffea0 100644 --- a/docs/features/models/syntaxparser.rst +++ b/docs/features/models/syntaxparser.rst @@ -30,7 +30,7 @@ other cues. Note also that syntactic relations (`nsubj`, `obj` and so one) have which makes syntactic parsing an appealing preprocessing step for the semantic-oriented tasks. Model usage -=========== +----------- Before using the model make sure that all required packages are installed using the command: @@ -83,7 +83,7 @@ For other usage options see the :doc:`morphological tagger documentation ` and source code. Model architecture -================== +------------------ We use BERT as the lowest layer of our model (the embedder). To extract syntactic information we apply the biaffine network of `[Dozat, Manning, 2017] `__. @@ -140,7 +140,7 @@ to return the optimal tree, using the open-source modification from `dependency_decoding package `. Model quality -============= +------------- Syntactic parsers are evaluated using two metrics: UAS (unlabeled attachment score), which is the percentage of correctly predicted head positions. The second metric is LAS (labeled attachment From c64e38691d8df2bf16ef2c3329262c2b63dfc60b Mon Sep 17 00:00:00 2001 From: Fedor Ignatov Date: Mon, 30 Dec 2019 14:39:34 +0300 Subject: [PATCH 05/15] feat: add agent-rabbit mode to interact with dp-agent via RabbitMQ (#1106) * feat: rabbit service is working * refactor: removed some redundant code, parameters moved to config file * feat: command line args * refactor: docstrings, redundant arguments * refactor: error handling and utterance inference * feat: agent-rabbit integration tests * feat: removed tests, changed default models args value * refactor: removed test class and added docstrings * refactor: docstring, var naming --- deeppavlov/core/commands/infer.py | 6 +- deeppavlov/deep.py | 27 ++- deeppavlov/utils/agent/__init__.py | 1 + deeppavlov/utils/agent/messages.py | 76 ++++++ deeppavlov/utils/agent/rabbitmq.py | 243 +++++++++++++++++++ deeppavlov/utils/agent/server.py | 90 +++++++ deeppavlov/utils/settings/server_config.json | 13 +- docs/index.rst | 1 + docs/integrations/dp_agent.rst | 64 +++++ requirements.txt | 1 + 10 files changed, 517 insertions(+), 5 deletions(-) create mode 100644 deeppavlov/utils/agent/__init__.py create mode 100644 deeppavlov/utils/agent/messages.py create mode 100644 deeppavlov/utils/agent/rabbitmq.py create mode 100644 deeppavlov/utils/agent/server.py create mode 100644 docs/integrations/dp_agent.rst diff --git a/deeppavlov/core/commands/infer.py b/deeppavlov/core/commands/infer.py index a02149e580..4d17492751 100644 --- a/deeppavlov/core/commands/infer.py +++ b/deeppavlov/core/commands/infer.py @@ -92,8 +92,12 @@ def interact_model(config: Union[str, Path, dict]) -> None: print('>>', *pred) -def predict_on_stream(config: Union[str, Path, dict], batch_size: int = 1, file_path: Optional[str] = None) -> None: +def predict_on_stream(config: Union[str, Path, dict], + batch_size: Optional[int] = None, + file_path: Optional[str] = None) -> None: """Make a prediction with the component 
described in corresponding configuration file.""" + + batch_size = batch_size or 1 if file_path is None or file_path == '-': if sys.stdin.isatty(): raise RuntimeError('To process data from terminal please use interact mode') diff --git a/deeppavlov/deep.py b/deeppavlov/deep.py index 29e6cca4a7..489e0932cf 100644 --- a/deeppavlov/deep.py +++ b/deeppavlov/deep.py @@ -20,6 +20,7 @@ from deeppavlov.core.common.cross_validation import calc_cv_score from deeppavlov.core.common.file import find_config from deeppavlov.download import deep_download +from deeppavlov.utils.agent import start_rabbit_service from deeppavlov.utils.alexa import start_alexa_server from deeppavlov.utils.alice import start_alice_server from deeppavlov.utils.ms_bot_framework import start_ms_bf_server @@ -34,14 +35,14 @@ parser.add_argument("mode", help="select a mode, train or interact", type=str, choices={'train', 'evaluate', 'interact', 'predict', 'telegram', 'msbot', 'alexa', 'alice', - 'riseapi', 'risesocket', 'download', 'install', 'crossval'}) + 'riseapi', 'risesocket', 'agent-rabbit', 'download', 'install', 'crossval'}) parser.add_argument("config_path", help="path to a pipeline json config", type=str) parser.add_argument("-e", "--start-epoch-num", dest="start_epoch_num", default=None, help="Start epoch number", type=int) parser.add_argument("--recursive", action="store_true", help="Train nested configs") -parser.add_argument("-b", "--batch-size", dest="batch_size", default=1, help="inference batch size", type=int) +parser.add_argument("-b", "--batch-size", dest="batch_size", default=None, help="inference batch size", type=int) parser.add_argument("-f", "--input-file", dest="file_path", default=None, help="Path to the input file", type=str) parser.add_argument("-d", "--download", action="store_true", help="download model components") @@ -58,9 +59,18 @@ parser.add_argument("-p", "--port", default=None, help="api port", type=int) -parser.add_argument("--socket-type", default='TCP', type=str, choices={"TCP", "UNIX"}) +parser.add_argument("--socket-type", default="TCP", type=str, choices={"TCP", "UNIX"}) parser.add_argument("--socket-file", default="/tmp/deeppavlov_socket.s", type=str) +parser.add_argument("-sn", "--service-name", default=None, help="service name for agent-rabbit mode", type=str) +parser.add_argument("-an", "--agent-namespace", default=None, help="dp-agent namespace name", type=str) +parser.add_argument("-ul", "--utterance-lifetime", default=None, help="message expiration in seconds", type=int) +parser.add_argument("-rh", "--rabbit-host", default=None, help="RabbitMQ server host", type=str) +parser.add_argument("-rp", "--rabbit-port", default=None, help="RabbitMQ server port", type=int) +parser.add_argument("-rl", "--rabbit-login", default=None, help="RabbitMQ server login", type=str) +parser.add_argument("-rpwd", "--rabbit-password", default=None, help="RabbitMQ server password", type=str) +parser.add_argument("-rvh", "--rabbit-virtualhost", default=None, help="RabbitMQ server virtualhost", type=str) + def main(): args = parser.parse_args() @@ -103,6 +113,17 @@ def main(): start_model_server(pipeline_config_path, args.https, args.key, args.cert, port=args.port) elif args.mode == 'risesocket': start_socket_server(pipeline_config_path, args.socket_type, port=args.port, socket_file=args.socket_file) + elif args.mode == 'agent-rabbit': + start_rabbit_service(model_config=pipeline_config_path, + service_name=args.service_name, + agent_namespace=args.agent_namespace, + batch_size=args.batch_size, + 
utterance_lifetime_sec=args.utterance_lifetime, + rabbit_host=args.rabbit_host, + rabbit_port=args.rabbit_port, + rabbit_login=args.rabbit_login, + rabbit_password=args.rabbit_password, + rabbit_virtualhost=args.rabbit_virtualhost) elif args.mode == 'predict': predict_on_stream(pipeline_config_path, args.batch_size, args.file_path) elif args.mode == 'install': diff --git a/deeppavlov/utils/agent/__init__.py b/deeppavlov/utils/agent/__init__.py new file mode 100644 index 0000000000..b737818baf --- /dev/null +++ b/deeppavlov/utils/agent/__init__.py @@ -0,0 +1 @@ +from .server import start_rabbit_service diff --git a/deeppavlov/utils/agent/messages.py b/deeppavlov/utils/agent/messages.py new file mode 100644 index 0000000000..e4326a8d24 --- /dev/null +++ b/deeppavlov/utils/agent/messages.py @@ -0,0 +1,76 @@ +# Copyright 2019 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Module contains classes defining messages received and sent by service via RabbitMQ message broker. + +The classes created to document the DeepPavlov Agent API and should match the corresponding classes +from https://github.com/deepmipt/dp-agent/blob/master/core/transport/messages.py + +""" + +from typing import Any + + +class MessageBase: + agent_name: str + msg_type: str + + def __init__(self, msg_type: str, agent_name: str) -> None: + self.msg_type = msg_type + self.agent_name = agent_name + + @classmethod + def from_json(cls, message_json: dict): + return cls(**message_json) + + def to_json(self) -> dict: + return self.__dict__ + + +class ServiceTaskMessage(MessageBase): + payload: dict + + def __init__(self, agent_name: str, payload: dict) -> None: + super().__init__('service_task', agent_name) + self.payload = payload + + +class ServiceResponseMessage(MessageBase): + response: Any + task_id: str + + def __init__(self, task_id: str, agent_name: str, response: Any) -> None: + super().__init__('service_response', agent_name) + self.task_id = task_id + self.response = response + + +def get_service_task_message(message_json: dict) -> ServiceTaskMessage: + """Creates an instance of ServiceTaskMessage class using its json representation. + + Args: + message_json: Dictionary with class fields. + + Returns: + New ServiceTaskMessage instance. + + Raises: + ValueError if dict with instance fields isn't from an instance of ServiceTaskMessage class. + + """ + message_type = message_json.pop('msg_type') + + if message_type != 'service_task': + raise TypeError(f'Unknown transport message type: {message_type}') + + return ServiceTaskMessage.from_json(message_json) diff --git a/deeppavlov/utils/agent/rabbitmq.py b/deeppavlov/utils/agent/rabbitmq.py new file mode 100644 index 0000000000..96feddfb8c --- /dev/null +++ b/deeppavlov/utils/agent/rabbitmq.py @@ -0,0 +1,243 @@ +# Copyright 2019 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import json +import logging +import time +from collections import defaultdict +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +import aio_pika +from aio_pika import Connection, Channel, Exchange, Queue, IncomingMessage, Message + +from deeppavlov.core.commands.infer import build_model +from deeppavlov.core.common.chainer import Chainer +from deeppavlov.core.data.utils import jsonify_data +from deeppavlov.utils.agent.messages import ServiceTaskMessage, ServiceResponseMessage, get_service_task_message +from deeppavlov.utils.connector import DialogLogger +from deeppavlov.utils.server import get_server_params + +dialog_logger = DialogLogger(logger_name='agent_rabbit') +log = logging.getLogger(__name__) + +AGENT_IN_EXCHANGE_NAME_TEMPLATE = '{agent_namespace}_e_in' +AGENT_OUT_EXCHANGE_NAME_TEMPLATE = '{agent_namespace}_e_out' +AGENT_ROUTING_KEY_TEMPLATE = 'agent.{agent_name}' + +SERVICE_QUEUE_NAME_TEMPLATE = '{agent_namespace}_q_service_{service_name}' +SERVICE_ROUTING_KEY_TEMPLATE = 'service.{service_name}' + + +class RabbitMQServiceGateway: + """Class object connects to the RabbitMQ broker to process requests from the DeepPavlov Agent.""" + _add_to_buffer_lock: asyncio.Lock + _infer_lock: asyncio.Lock + _model: Chainer + _model_args_names: List[str] + _incoming_messages_buffer: List[IncomingMessage] + _batch_size: int + _utterance_lifetime_sec: int + _in_queue: Optional[Queue] + _connection: Connection + _agent_in_exchange: Exchange + _agent_out_exchange: Exchange + _agent_in_channel: Channel + _agent_out_channel: Channel + + def __init__(self, + model_config: Union[str, Path], + service_name: str, + agent_namespace: str, + batch_size: int, + utterance_lifetime_sec: int, + rabbit_host: str, + rabbit_port: int, + rabbit_login: str, + rabbit_password: str, + rabbit_virtualhost: str, + loop: asyncio.AbstractEventLoop) -> None: + self._add_to_buffer_lock = asyncio.Lock() + self._infer_lock = asyncio.Lock() + server_params = get_server_params(model_config) + self._model_args_names = server_params['model_args_names'] + self._model = build_model(model_config) + self._in_queue = None + self._utterance_lifetime_sec = utterance_lifetime_sec + self._batch_size = batch_size + self._incoming_messages_buffer = [] + + loop.run_until_complete(self._connect(loop=loop, host=rabbit_host, port=rabbit_port, login=rabbit_login, + password=rabbit_password, virtualhost=rabbit_virtualhost, + agent_namespace=agent_namespace)) + loop.run_until_complete(self._setup_queues(service_name, agent_namespace)) + loop.run_until_complete(self._in_queue.consume(callback=self._on_message_callback)) + + log.info(f'Service in queue started consuming') + + async def _connect(self, + loop: asyncio.AbstractEventLoop, + host: str, + port: int, + login: str, + password: str, + virtualhost: str, + agent_namespace: str) -> None: + """Connects to RabbitMQ message broker and initiates agent in and out channels and exchanges.""" + log.info('Starting RabbitMQ connection...') + + while True: + try: + self._connection = await aio_pika.connect_robust(loop=loop, + host=host, + port=port, + 
login=login, + password=password, + virtualhost=virtualhost) + log.info('RabbitMQ connected') + break + except ConnectionError: + reconnect_timeout = 5 + log.error(f'RabbitMQ connection error, making another attempt in {reconnect_timeout} secs') + time.sleep(reconnect_timeout) + + self._agent_in_channel = await self._connection.channel() + agent_in_exchange_name = AGENT_IN_EXCHANGE_NAME_TEMPLATE.format(agent_namespace=agent_namespace) + self._agent_in_exchange = await self._agent_in_channel.declare_exchange(name=agent_in_exchange_name, + type=aio_pika.ExchangeType.TOPIC) + log.info(f'Declared agent in exchange: {agent_in_exchange_name}') + + self._agent_out_channel = await self._connection.channel() + agent_out_exchange_name = AGENT_OUT_EXCHANGE_NAME_TEMPLATE.format(agent_namespace=agent_namespace) + self._agent_out_exchange = await self._agent_in_channel.declare_exchange(name=agent_out_exchange_name, + type=aio_pika.ExchangeType.TOPIC) + log.info(f'Declared agent out exchange: {agent_out_exchange_name}') + + def disconnect(self): + self._connection.close() + + async def _setup_queues(self, service_name: str, agent_namespace: str) -> None: + """Setups input queue to get messages from DeepPavlov Agent.""" + in_queue_name = SERVICE_QUEUE_NAME_TEMPLATE.format(agent_namespace=agent_namespace, + service_name=service_name) + + self._in_queue = await self._agent_out_channel.declare_queue(name=in_queue_name, durable=True) + log.info(f'Declared service in queue: {in_queue_name}') + + service_routing_key = SERVICE_ROUTING_KEY_TEMPLATE.format(service_name=service_name) + await self._in_queue.bind(exchange=self._agent_out_exchange, routing_key=service_routing_key) + log.info(f'Queue: {in_queue_name} bound to routing key: {service_routing_key}') + + await self._agent_out_channel.set_qos(prefetch_count=self._batch_size * 2) + + async def _on_message_callback(self, message: IncomingMessage) -> None: + """Processes messages from the input queue. + + Collects incoming messages to buffer, sends tasks batches for further processing. Depending on the success of + the processing result sends negative or positive acknowledgements to the input messages. 
+ + """ + await self._add_to_buffer_lock.acquire() + self._incoming_messages_buffer.append(message) + log.debug('Incoming message received') + + if len(self._incoming_messages_buffer) < self._batch_size: + self._add_to_buffer_lock.release() + + await self._infer_lock.acquire() + try: + messages_batch = self._incoming_messages_buffer + valid_messages_batch: List[IncomingMessage] = [] + tasks_batch: List[ServiceTaskMessage] = [] + + if messages_batch: + self._incoming_messages_buffer = [] + + if self._add_to_buffer_lock.locked(): + self._add_to_buffer_lock.release() + + for message in messages_batch: + try: + task = get_service_task_message(json.loads(message.body, encoding='utf-8')) + tasks_batch.append(task) + valid_messages_batch.append(message) + except Exception as e: + log.error(f'Failed to get ServiceTaskMessage from the incoming message: {repr(e)}') + await message.reject() + + elif self._add_to_buffer_lock.locked(): + self._add_to_buffer_lock.release() + + if tasks_batch: + try: + await self._process_tasks(tasks_batch) + except Exception as e: + log.error(f'got exception while processing tasks: {repr(e)}') + for message in valid_messages_batch: + await message.reject() + else: + for message in valid_messages_batch: + await message.ack() + finally: + self._infer_lock.release() + + async def _process_tasks(self, tasks_batch: List[ServiceTaskMessage]) -> None: + """Gets from tasks batch payloads to infer model, processes them and creates tasks to send results.""" + task_uuids_batch, payloads = \ + zip(*[(task.payload['task_id'], task.payload['payload']) for task in tasks_batch]) + + log.debug(f'Prepared to infer tasks {", ".join(task_uuids_batch)}') + + responses_batch = await asyncio.wait_for(self._interact(payloads), + self._utterance_lifetime_sec) + + results_replies = [self._send_results(task, response) for task, response in zip(tasks_batch, responses_batch)] + await asyncio.gather(*results_replies) + + log.debug(f'Processed tasks {", ".join(task_uuids_batch)}') + + async def _interact(self, payloads: List[Dict]) -> List[Any]: + """Infers model with the batch.""" + batch = defaultdict(list) + + for payload in payloads: + for arg_name in self._model_args_names: + batch[arg_name].extend(payload.get(arg_name, [None])) + + dialog_logger.log_in(batch) + + prediction = self._model(*batch.values()) + if len(self._model.out_params) == 1: + prediction = [prediction] + prediction = list(zip(*prediction)) + result = jsonify_data(prediction) + + dialog_logger.log_out(result) + + return result + + async def _send_results(self, task: ServiceTaskMessage, response: Dict) -> None: + """Sends responses batch to the DeepPavlov Agent using agent input exchange.""" + result = ServiceResponseMessage(agent_name=task.agent_name, + task_id=task.payload["task_id"], + response=response) + + message = Message(body=json.dumps(result.to_json()).encode('utf-8'), + delivery_mode=aio_pika.DeliveryMode.PERSISTENT, + expiration=self._utterance_lifetime_sec) + + routing_key = AGENT_ROUTING_KEY_TEMPLATE.format(agent_name=task.agent_name) + await self._agent_in_exchange.publish(message=message, routing_key=routing_key) + log.debug(f'Sent response for task {str(task.payload["task_id"])} with routing key {routing_key}') diff --git a/deeppavlov/utils/agent/server.py b/deeppavlov/utils/agent/server.py new file mode 100644 index 0000000000..b9f3359af8 --- /dev/null +++ b/deeppavlov/utils/agent/server.py @@ -0,0 +1,90 @@ +# Copyright 2019 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, 
Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import logging +from pathlib import Path +from typing import Optional, Union + +from deeppavlov.core.common.file import read_json +from deeppavlov.core.common.paths import get_settings_path +from deeppavlov.utils.agent.rabbitmq import RabbitMQServiceGateway + +CONNECTOR_CONFIG_FILENAME = 'server_config.json' + + +def start_rabbit_service(model_config: Union[str, Path], + service_name: Optional[str] = None, + agent_namespace: Optional[str] = None, + batch_size: Optional[int] = None, + utterance_lifetime_sec: Optional[int] = None, + rabbit_host: Optional[str] = None, + rabbit_port: Optional[int] = None, + rabbit_login: Optional[str] = None, + rabbit_password: Optional[str] = None, + rabbit_virtualhost: Optional[str] = None) -> None: + """Launches DeepPavlov model receiving utterances and sending responses via RabbitMQ message broker. + + Args: + model_config: Path to DeepPavlov model to be launched. + service_name: Service name set in DeepPavlov Agent config. Used to format RabbitMQ exchanges, queues and routing + keys names. + agent_namespace: Service processes messages only from agents with the same namespace value. + batch_size: Limits the maximum number of utterances to be processed by service at one inference. + utterance_lifetime_sec: RabbitMQ message expiration time in seconds. + rabbit_host: RabbitMQ server host name. + rabbit_port: RabbitMQ server port number. + rabbit_login: RabbitMQ server administrator username. + rabbit_password: RabbitMQ server administrator password. + rabbit_virtualhost: RabbitMQ server virtualhost name. 
+ + """ + service_config_path = get_settings_path() / CONNECTOR_CONFIG_FILENAME + service_config: dict = read_json(service_config_path)['agent-rabbit'] + + service_name = service_name or service_config['service_name'] + agent_namespace = agent_namespace or service_config['agent_namespace'] + batch_size = batch_size or service_config['batch_size'] + utterance_lifetime_sec = utterance_lifetime_sec or service_config['utterance_lifetime_sec'] + rabbit_host = rabbit_host or service_config['rabbit_host'] + rabbit_port = rabbit_port or service_config['rabbit_port'] + rabbit_login = rabbit_login or service_config['rabbit_login'] + rabbit_password = rabbit_password or service_config['rabbit_password'] + rabbit_virtualhost = rabbit_virtualhost or service_config['rabbit_virtualhost'] + + loop = asyncio.get_event_loop() + + gateway = RabbitMQServiceGateway( + model_config=model_config, + service_name=service_name, + agent_namespace=agent_namespace, + batch_size=batch_size, + utterance_lifetime_sec=utterance_lifetime_sec, + rabbit_host=rabbit_host, + rabbit_port=rabbit_port, + rabbit_login=rabbit_login, + rabbit_password=rabbit_password, + rabbit_virtualhost=rabbit_virtualhost, + loop=loop + ) + + try: + loop.run_forever() + except KeyboardInterrupt: + pass + finally: + gateway.disconnect() + loop.stop() + loop.close() + logging.shutdown() diff --git a/deeppavlov/utils/settings/server_config.json b/deeppavlov/utils/settings/server_config.json index 9949bf3a57..9fa2ebb2f3 100644 --- a/deeppavlov/utils/settings/server_config.json +++ b/deeppavlov/utils/settings/server_config.json @@ -2,7 +2,7 @@ "common_defaults": { "host": "0.0.0.0", "port": 5000, - "model_args_names": "", + "model_args_names": [], "https": false, "https_cert_path": "", "https_key_path": "", @@ -18,6 +18,17 @@ "next_argument_message": "Please enter an argument '{}'", "unsupported_message": "Unsupported message received." }, + "agent-rabbit": { + "service_name": "", + "agent_namespace": "deeppavlov_agent", + "batch_size": 1, + "utterance_lifetime_sec": 120, + "rabbit_host": "0.0.0.0", + "rabbit_port": 5672, + "rabbit_login": "guest", + "rabbit_password": "guest", + "rabbit_virtualhost": "/" + }, "deprecated": { "AlexaBot": { "amazon_cert_lifetime_secs": 3600, diff --git a/docs/index.rst b/docs/index.rst index 5c9b2d89bf..2f711a01a8 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -61,6 +61,7 @@ Welcome to DeepPavlov's documentation! REST API Socket API + DeepPavlov Agent RabbitMQ integration Telegram integration Yandex Alice integration Amazon Alexa integration diff --git a/docs/integrations/dp_agent.rst b/docs/integrations/dp_agent.rst new file mode 100644 index 0000000000..4f64923f2c --- /dev/null +++ b/docs/integrations/dp_agent.rst @@ -0,0 +1,64 @@ +DeepPavlov Agent RabbitMQ integration +===================================== + +Any model specified by a DeepPavlov config can be launched as a service for +`DeepPavlov Agent `_ +communicating with agent through RabbitMQ message broker. You can launch it +using command line interface or using python. + +Command line interface +~~~~~~~~~~~~~~~~~~~~~~ + +To run a model specified by the ```` config file as a DeepPavlov Agent service, run: + +.. code:: bash + + python -m deeppavlov agent-rabbit [-d] \ + [-sn ] \ + [-an ] \ + [-ans ] \ + [-b ] \ + [-ul ] \ + [-rp ] \ + [-rl ] \ + [-rpwd ] \ + [-rvh ] + +* ``-d``: download model specific data before starting the service. +* ``-sn ``: service name set in the connector section of the DeepPavlov Agent config file. 
+* ``-an ``: namespace the service works in. Messages only from agents from this namespace is processed. +* ``-b ``: inference batch size. +* ``-ul ``: RabbitMQ server host. +* ``-rp ``: RabbitMQ server port. +* ``-rl ``: RabbitMQ server login. +* ``-rpwd ``: RabbitMQ server password. +* ``-rvh ``: RabbitMQ server virtualhost. + +Default values of optional arguments can be modified via changing ``agent-rabbit`` section of the file +``deeppavlov/utils/settings/server_config.json``. + +Python interface +~~~~~~~~~~~~~~~~ + +To run a model specified by the ```` config file as a DeepPavlov Agent service using python, +run the following code: + +.. code:: python + + from deeppavlov.utils.agent import start_rabbit_service + + start_rabbit_service(model_config=, + service_name=, + agent_namespace=, + batch_size=, + utterance_lifetime_sec=, + rabbit_host=, + rabbit_port=, + rabbit_login=, + rabbit_password=, + rabbit_virtualhost=) + +All arguments except ```` are optional. Default values of optional arguments can be modified via changing +``agent-rabbit`` section of the file ``deeppavlov/utils/settings/server_config.json``. \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 296d4840db..039a144feb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +aio-pika==5.6.0 Cython==0.29.12 fastapi==0.38.1 fuzzywuzzy==0.17.0 From 6ed1317580a348c0eb4708d46b61a6d516e1887f Mon Sep 17 00:00:00 2001 From: Fedor Ignatov Date: Thu, 16 Jan 2020 12:52:38 +0300 Subject: [PATCH 06/15] feat: add service error message type (#1112) --- deeppavlov/utils/agent/messages.py | 13 ++++++++++++ deeppavlov/utils/agent/rabbitmq.py | 32 +++++++++++++++++++++++------- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/deeppavlov/utils/agent/messages.py b/deeppavlov/utils/agent/messages.py index e4326a8d24..568b417722 100644 --- a/deeppavlov/utils/agent/messages.py +++ b/deeppavlov/utils/agent/messages.py @@ -55,6 +55,19 @@ def __init__(self, task_id: str, agent_name: str, response: Any) -> None: self.response = response +class ServiceErrorMessage(MessageBase): + formatted_exc: str + + def __init__(self, task_id: str, agent_name: str, formatted_exc: str) -> None: + super().__init__('error', agent_name) + self.task_id = task_id + self.formatted_exc = formatted_exc + + @property + def exception(self) -> Exception: + return Exception(self.formatted_exc) + + def get_service_task_message(message_json: dict) -> ServiceTaskMessage: """Creates an instance of ServiceTaskMessage class using its json representation. 
diff --git a/deeppavlov/utils/agent/rabbitmq.py b/deeppavlov/utils/agent/rabbitmq.py index 96feddfb8c..ea82955374 100644 --- a/deeppavlov/utils/agent/rabbitmq.py +++ b/deeppavlov/utils/agent/rabbitmq.py @@ -18,6 +18,7 @@ import time from collections import defaultdict from pathlib import Path +from traceback import format_exc from typing import Any, Dict, List, Optional, Union import aio_pika @@ -26,7 +27,8 @@ from deeppavlov.core.commands.infer import build_model from deeppavlov.core.common.chainer import Chainer from deeppavlov.core.data.utils import jsonify_data -from deeppavlov.utils.agent.messages import ServiceTaskMessage, ServiceResponseMessage, get_service_task_message +from deeppavlov.utils.agent.messages import ServiceTaskMessage, ServiceResponseMessage, ServiceErrorMessage +from deeppavlov.utils.agent.messages import get_service_task_message from deeppavlov.utils.connector import DialogLogger from deeppavlov.utils.server import get_server_params @@ -184,7 +186,11 @@ async def _on_message_callback(self, message: IncomingMessage) -> None: try: await self._process_tasks(tasks_batch) except Exception as e: - log.error(f'got exception while processing tasks: {repr(e)}') + task_ids = [task.payload["task_id"] for task in tasks_batch] + log.error(f'got exception {repr(e)} while processing tasks {", ".join(task_ids)}') + formatted_exception = format_exc() + error_replies = [self._send_results(task, formatted_exception) for task in tasks_batch] + await asyncio.gather(*error_replies) for message in valid_messages_batch: await message.reject() else: @@ -228,11 +234,23 @@ async def _interact(self, payloads: List[Dict]) -> List[Any]: return result - async def _send_results(self, task: ServiceTaskMessage, response: Dict) -> None: - """Sends responses batch to the DeepPavlov Agent using agent input exchange.""" - result = ServiceResponseMessage(agent_name=task.agent_name, - task_id=task.payload["task_id"], - response=response) + async def _send_results(self, task: ServiceTaskMessage, response: Union[Dict, str]) -> None: + """Sends responses batch to the DeepPavlov Agent using agent input exchange. + + Args: + task: Task message from DeepPavlov Agent. 
+ response: DeepPavlov model response (dict type) if infer was successful otherwise string representation of + raised error + + """ + if isinstance(response, dict): + result = ServiceResponseMessage(agent_name=task.agent_name, + task_id=task.payload["task_id"], + response=response) + else: + result = ServiceErrorMessage(agent_name=task.agent_name, + task_id=task.payload["task_id"], + formatted_exc=response) message = Message(body=json.dumps(result.to_json()).encode('utf-8'), delivery_mode=aio_pika.DeliveryMode.PERSISTENT, From 4a40bea1695ba2486af869b35b8ffd6db68715b4 Mon Sep 17 00:00:00 2001 From: Fedor Ignatov Date: Mon, 20 Jan 2020 12:32:18 +0300 Subject: [PATCH 07/15] feat: update uvicorn, fastapi and aio-pika versions (#1113) * feat: uvicorn, fastapi and aio-pika updated * fix: pydantic version update error fixed * feat: pydantic added to requirements --- deeppavlov/core/common/log.py | 16 +++++++++--- deeppavlov/utils/alexa/server.py | 4 +-- deeppavlov/utils/alice/server.py | 4 +-- deeppavlov/utils/ms_bot_framework/server.py | 4 +-- deeppavlov/utils/server/server.py | 29 +++++++-------------- deeppavlov/utils/settings/log_config.json | 17 ++++++++++-- requirements.txt | 7 ++--- 7 files changed, 47 insertions(+), 34 deletions(-) diff --git a/deeppavlov/core/common/log.py b/deeppavlov/core/common/log.py index 36b038d04a..30b4f57ec7 100644 --- a/deeppavlov/core/common/log.py +++ b/deeppavlov/core/common/log.py @@ -24,13 +24,21 @@ root_path = Path(__file__).resolve().parents[3] +log_config_path = get_settings_path() / LOG_CONFIG_FILENAME -def init_logger(): - log_config_path = get_settings_path() / LOG_CONFIG_FILENAME +with log_config_path.open(encoding='utf8') as log_config_json: + log_config = json.load(log_config_json) + + +class ProbeFilter(logging.Filter): + """ProbeFilter class is used to filter POST requests to /probe endpoint from logs.""" - with log_config_path.open(encoding='utf8') as log_config_json: - log_config = json.load(log_config_json) + def filter(self, record: logging.LogRecord) -> bool: + """To log the record method should return True.""" + return 'POST /probe HTTP' not in record.getMessage() + +def init_logger(): configured_loggers = [log_config.get('root', {})] + [logger for logger in log_config.get('loggers', {}).values()] diff --git a/deeppavlov/utils/alexa/server.py b/deeppavlov/utils/alexa/server.py index 18805c26ea..eff296f733 100644 --- a/deeppavlov/utils/alexa/server.py +++ b/deeppavlov/utils/alexa/server.py @@ -23,12 +23,12 @@ from fastapi import FastAPI from starlette.responses import JSONResponse +from deeppavlov.core.common.log import log_config from deeppavlov.utils.alexa.request_parameters import data_body, cert_chain_url_header, signature_header from deeppavlov.utils.connector import AlexaBot from deeppavlov.utils.server import get_ssl_params, redirect_root_to_docs, get_server_params log = getLogger(__name__) -uvicorn_log = getLogger('uvicorn') app = FastAPI() @@ -83,6 +83,6 @@ async def interact(data: dict = data_body, response_code = 400 if 'error' in response.keys() else 200 return JSONResponse(response, status_code=response_code) - uvicorn.run(app, host=host, port=port, logger=uvicorn_log, ssl_version=ssl_config.version, + uvicorn.run(app, host=host, port=port, log_config=log_config, ssl_version=ssl_config.version, ssl_keyfile=ssl_config.keyfile, ssl_certfile=ssl_config.certfile) bot.join() diff --git a/deeppavlov/utils/alice/server.py b/deeppavlov/utils/alice/server.py index 7c5a49f138..33efd6e46a 100644 --- a/deeppavlov/utils/alice/server.py +++ 
b/deeppavlov/utils/alice/server.py @@ -21,12 +21,12 @@ import uvicorn from fastapi import FastAPI +from deeppavlov.core.common.log import log_config from deeppavlov.utils.alice.request_parameters import data_body from deeppavlov.utils.connector import AliceBot from deeppavlov.utils.server import get_server_params, get_ssl_params, redirect_root_to_docs log = getLogger(__name__) -uvicorn_log = getLogger('uvicorn') app = FastAPI() @@ -60,6 +60,6 @@ async def answer(data: dict = data_body) -> dict: response: dict = await loop.run_in_executor(None, bot.output_queue.get) return response - uvicorn.run(app, host=host, port=port, logger=uvicorn_log, ssl_version=ssl_config.version, + uvicorn.run(app, host=host, port=port, log_config=log_config, ssl_version=ssl_config.version, ssl_keyfile=ssl_config.keyfile, ssl_certfile=ssl_config.certfile) bot.join() diff --git a/deeppavlov/utils/ms_bot_framework/server.py b/deeppavlov/utils/ms_bot_framework/server.py index 0b4d4fd868..325a8756a2 100644 --- a/deeppavlov/utils/ms_bot_framework/server.py +++ b/deeppavlov/utils/ms_bot_framework/server.py @@ -20,11 +20,11 @@ import uvicorn from fastapi import FastAPI +from deeppavlov.core.common.log import log_config from deeppavlov.utils.connector import MSBot from deeppavlov.utils.server import get_server_params, get_ssl_params, redirect_root_to_docs log = getLogger(__name__) -uvicorn_log = getLogger('uvicorn') app = FastAPI() @@ -55,6 +55,6 @@ async def answer(activity: dict) -> dict: bot.input_queue.put(activity) return {} - uvicorn.run(app, host=host, port=port, logger=uvicorn_log, ssl_version=ssl_config.version, + uvicorn.run(app, host=host, port=port, log_config=log_config, ssl_version=ssl_config.version, ssl_keyfile=ssl_config.keyfile, ssl_certfile=ssl_config.certfile) bot.join() diff --git a/deeppavlov/utils/server/server.py b/deeppavlov/utils/server/server.py index f05655ddc1..751bda4814 100644 --- a/deeppavlov/utils/server/server.py +++ b/deeppavlov/utils/server/server.py @@ -13,8 +13,8 @@ # limitations under the License. 
import asyncio -import logging from collections import namedtuple +from logging import getLogger from pathlib import Path from ssl import PROTOCOL_TLSv1_2 from typing import Dict, List, Optional, Union @@ -23,8 +23,8 @@ from fastapi import Body, FastAPI, HTTPException from fastapi.utils import generate_operation_id_for_path from pydantic import BaseConfig, BaseModel, Schema -from pydantic.fields import Field -from pydantic.main import MetaModel +from pydantic.fields import Field, ModelField +from pydantic.main import ModelMetaclass from starlette.middleware.cors import CORSMiddleware from starlette.responses import RedirectResponse @@ -32,6 +32,7 @@ from deeppavlov.core.commands.utils import parse_config from deeppavlov.core.common.chainer import Chainer from deeppavlov.core.common.file import read_json +from deeppavlov.core.common.log import log_config from deeppavlov.core.common.paths import get_settings_path from deeppavlov.core.data.utils import check_nested_dict_keys, jsonify_data from deeppavlov.utils.connector import DialogLogger @@ -40,17 +41,9 @@ SSLConfig = namedtuple('SSLConfig', ['version', 'keyfile', 'certfile']) -class ProbeFilter(logging.Filter): - """ProbeFilter class is used to filter POST requests to /probe endpoint from logs.""" - - def filter(self, record: logging.LogRecord) -> bool: - """To log the record method should return True.""" - return 'POST /probe HTTP' not in record.getMessage() - +log = getLogger(__name__) +dialog_logger = DialogLogger(logger_name='rest_api') -log = logging.getLogger(__name__) -uvicorn_log = logging.getLogger('uvicorn') -uvicorn_log.addFilter(ProbeFilter()) app = FastAPI(__file__) app.add_middleware( @@ -61,8 +54,6 @@ def filter(self, record: logging.LogRecord) -> bool: allow_headers=['*'] ) -dialog_logger = DialogLogger(logger_name='rest_api') - def get_server_params(model_config: Union[str, Path]) -> Dict: server_config = read_json(SERVER_CONFIG_PATH) @@ -180,10 +171,10 @@ def start_model_server(model_config: Path, model = build_model(model_config) - def batch_decorator(cls: MetaModel) -> MetaModel: + def batch_decorator(cls: ModelMetaclass) -> ModelMetaclass: cls.__annotations__ = {arg_name: list for arg_name in model_args_names} - cls.__fields__ = {arg_name: Field(name=arg_name, type_=list, class_validators=None, - model_config=BaseConfig, required=False, schema=Schema(None)) + cls.__fields__ = {arg_name: ModelField(name=arg_name, type_=list, class_validators=None, + model_config=BaseConfig, required=False, field_info=Field(None)) for arg_name in model_args_names} return cls @@ -209,5 +200,5 @@ async def probe(item: Batch) -> List[str]: async def api() -> List[str]: return model_args_names - uvicorn.run(app, host=host, port=port, logger=uvicorn_log, ssl_version=ssl_config.version, + uvicorn.run(app, host=host, port=port, log_config=log_config, ssl_version=ssl_config.version, ssl_keyfile=ssl_config.keyfile, ssl_certfile=ssl_config.certfile, timeout_keep_alive=20) diff --git a/deeppavlov/utils/settings/log_config.json b/deeppavlov/utils/settings/log_config.json index 0b5ed38b0e..73ea3bd49f 100644 --- a/deeppavlov/utils/settings/log_config.json +++ b/deeppavlov/utils/settings/log_config.json @@ -9,7 +9,14 @@ ], "propagate": true }, - "uvicorn": { + "uvicorn.access": { + "level": "INFO", + "handlers": [ + "uvicorn_handler" + ], + "propagate": true + }, + "uvicorn.error": { "level": "INFO", "handlers": [ "uvicorn_handler" @@ -50,7 +57,13 @@ "class": "logging.StreamHandler", "level": "INFO", "formatter": "uvicorn_fmt", - "stream": 
"ext://sys.stdout" + "stream": "ext://sys.stdout", + "filters": ["probeFilter"] + } + }, + "filters": { + "probeFilter": { + "()": "deeppavlov.core.common.log.ProbeFilter" } } } diff --git a/requirements.txt b/requirements.txt index 039a144feb..0467d71d15 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -aio-pika==5.6.0 +aio-pika==6.4.1 Cython==0.29.12 -fastapi==0.38.1 +fastapi==0.46.0 fuzzywuzzy==0.17.0 h5py==2.9.0 keras==2.2.4 @@ -8,6 +8,7 @@ nltk==3.2.5 numpy==1.16.4 overrides==1.9 pandas==0.24.2 +pydantic==1.3 pymorphy2==0.8 pymorphy2-dicts-ru pyopenssl==19.0.0 @@ -17,4 +18,4 @@ rusenttokenize==0.0.5 scikit-learn==0.21.2 scipy==1.3.0 tqdm==4.32.2 -uvicorn==0.9.0 \ No newline at end of file +uvicorn==0.11.1 \ No newline at end of file From 8820084c757478875e40c6198570dfdfdb70ddc1 Mon Sep 17 00:00:00 2001 From: Aleksei Lymar Date: Mon, 27 Jan 2020 12:47:55 +0300 Subject: [PATCH 08/15] refactor: tf.keras (#1115) * feat: improve datasets for paraphrases * feat: user tf.keras instead of keras in keras_layers * refactor: create a base class for matching keras models * feat: user tf.keras instead of keras in keras_model * refactor: use tf.keras instead of keras * refactor: simplify classification model's __call__ method * refactor: use tf.keras instead of keras in morpho-tagger cells * refactor: use tf.keras instead of keras in morpho-tagger * refactor: use tf.keras instead of keras in bilstm_siamese_network * refactor: use tf.keras instead of keras in keras_siamese_model * refactor: use tf.keras instead of keras in mpm_siamese_network * refactor: use tf.keras instead of keras * fix: correctly split data in siamese_iterator * fix: user graph-wrapped __call__() calls in chainer for tf and keras * tests: use live ranking configs instead of test ones * chore: bump tf version to 1.15.0 * docs: autodocument KerasClassificationModel methods * fix: paraphraser_reader returns two classes instead of three again * feat: use fasttext==0.9.1 from pip * fix: add fasttext to autodoc_mock_imports * chore: update requirements * fix: use sacremoses instead of nltk moses * fix: add sacremoses to autodoc_mock_imports * fix: rollback fastapi and uvicorn versions * fix: rollback scikit-learn version * refactor: use activations from tf.keras instead of importing them * chore: remove an obsolete code comment Co-Authored-By: puleon * docs: correct f1 value for ner_ontonotes config * fix: typo Co-Authored-By: yurakuratov <9271630+yurakuratov@users.noreply.github.com> Co-authored-by: puleon Co-authored-by: yurakuratov <9271630+yurakuratov@users.noreply.github.com> --- .../configs/ranking/ranking_insurance.json | 3 +- .../ranking/ranking_insurance_interact.json | 6 +- deeppavlov/core/common/chainer.py | 4 +- deeppavlov/core/layers/keras_layers.py | 100 +++--------- deeppavlov/core/models/keras_model.py | 18 +-- deeppavlov/core/models/tf_backend.py | 16 +- .../dataset_iterators/siamese_iterator.py | 84 +--------- .../dataset_readers/paraphraser_reader.py | 52 +++--- .../classifiers/keras_classification_model.py | 90 +++++------ .../models/embedders/fasttext_embedder.py | 18 +-- deeppavlov/models/morpho_tagger/cells.py | 82 +++++----- .../models/morpho_tagger/common_tagger.py | 6 +- .../models/morpho_tagger/morpho_tagger.py | 94 +++++------ .../ranking/bilstm_gru_siamese_network.py | 6 +- .../models/ranking/bilstm_siamese_network.py | 23 ++- .../models/ranking/keras_siamese_model.py | 6 +- .../models/ranking/mpm_siamese_network.py | 9 +- .../models/tokenizers/nltk_moses_tokenizer.py | 2 +- 
deeppavlov/requirements/aiml_skill.txt | 2 +- deeppavlov/requirements/en_core_web_sm.txt | 2 +- deeppavlov/requirements/fasttext.txt | 3 +- deeppavlov/requirements/gensim.txt | 2 +- deeppavlov/requirements/kenlm.txt | 2 +- deeppavlov/requirements/spacy.txt | 2 +- deeppavlov/requirements/spelling.txt | 5 +- deeppavlov/requirements/tf-gpu.txt | 2 +- deeppavlov/requirements/tf-hub.txt | 2 +- deeppavlov/requirements/tf.txt | 2 +- docs/apiref/models/classifiers.rst | 14 +- docs/conf.py | 2 +- docs/features/models/ner.rst | 2 +- requirements.txt | 23 ++- ...hrase_ident_paraphraser_interact_test.json | 153 ------------------ .../paraphrase_ident_paraphraser_test.json | 140 ---------------- ...phrase_ident_qqp_bilstm_interact_test.json | 149 ----------------- .../paraphrase_ident_qqp_bilstm_test.json | 137 ---------------- .../paraphrase_ident_qqp_interact_test.json | 149 ----------------- .../ranking/paraphrase_ident_qqp_test.json | 137 ---------------- .../ranking_insurance_interact_test.json | 152 ----------------- .../ranking/ranking_insurance_test.json | 139 ---------------- ...king_ubuntu_v2_bert_sep_interact_test.json | 95 ----------- .../ranking_ubuntu_v2_bert_sep_test.json | 76 --------- .../ranking_ubuntu_v2_bert_uncased_test.json | 76 --------- .../ranking_ubuntu_v2_mt_interact_test.json | 148 ----------------- .../ranking/ranking_ubuntu_v2_mt_test.json | 133 --------------- .../ranking/ranking_ubuntu_v2_test.json | 131 --------------- tests/test_quick_start.py | 28 ++-- 47 files changed, 261 insertions(+), 2266 deletions(-) delete mode 100644 tests/test_configs/ranking/paraphrase_ident_paraphraser_interact_test.json delete mode 100644 tests/test_configs/ranking/paraphrase_ident_paraphraser_test.json delete mode 100644 tests/test_configs/ranking/paraphrase_ident_qqp_bilstm_interact_test.json delete mode 100644 tests/test_configs/ranking/paraphrase_ident_qqp_bilstm_test.json delete mode 100644 tests/test_configs/ranking/paraphrase_ident_qqp_interact_test.json delete mode 100644 tests/test_configs/ranking/paraphrase_ident_qqp_test.json delete mode 100644 tests/test_configs/ranking/ranking_insurance_interact_test.json delete mode 100644 tests/test_configs/ranking/ranking_insurance_test.json delete mode 100644 tests/test_configs/ranking/ranking_ubuntu_v2_bert_sep_interact_test.json delete mode 100644 tests/test_configs/ranking/ranking_ubuntu_v2_bert_sep_test.json delete mode 100644 tests/test_configs/ranking/ranking_ubuntu_v2_bert_uncased_test.json delete mode 100644 tests/test_configs/ranking/ranking_ubuntu_v2_mt_interact_test.json delete mode 100644 tests/test_configs/ranking/ranking_ubuntu_v2_mt_test.json delete mode 100644 tests/test_configs/ranking/ranking_ubuntu_v2_test.json diff --git a/deeppavlov/configs/ranking/ranking_insurance.json b/deeppavlov/configs/ranking/ranking_insurance.json index 4dcd2bc27d..56d5158983 100644 --- a/deeppavlov/configs/ranking/ranking_insurance.json +++ b/deeppavlov/configs/ranking/ranking_insurance.json @@ -5,8 +5,6 @@ }, "dataset_iterator": { "class_name": "siamese_iterator", - "random_batches": true, - "batches_per_epoch": 72, "seed": 243 }, "chainer": { @@ -18,6 +16,7 @@ "class_name": "siamese_preprocessor", "use_matrix": false, "num_ranking_samples": 500, + "pytest_num_ranking_samples": 2, "max_sequence_length": 200, "fit_on": ["x"], "in": ["x"], diff --git a/deeppavlov/configs/ranking/ranking_insurance_interact.json b/deeppavlov/configs/ranking/ranking_insurance_interact.json index 6e160f17de..4c1e2f9671 100644 --- 
a/deeppavlov/configs/ranking/ranking_insurance_interact.json +++ b/deeppavlov/configs/ranking/ranking_insurance_interact.json @@ -5,8 +5,6 @@ }, "dataset_iterator": { "class_name": "siamese_iterator", - "random_batches": true, - "batches_per_epoch": 72, "seed": 243 }, "chainer": { @@ -18,6 +16,7 @@ "class_name": "siamese_preprocessor", "use_matrix": false, "num_ranking_samples": 500, + "pytest_num_ranking_samples": 2, "max_sequence_length": 200, "fit_on": ["x"], "in": ["x"], @@ -69,7 +68,8 @@ "batch_size": 256, "save_path": "{MODELS_PATH}/insurance_model/model_weights.h5", "load_path": "{MODELS_PATH}/insurance_model/model_weights.h5", - "preprocess": "#preproc.__call__" + "preprocess": "#preproc.__call__", + "pytest_interact_pred_num": 3 }, { "in": ["x_proc"], diff --git a/deeppavlov/core/common/chainer.py b/deeppavlov/core/common/chainer.py index 8aef21dc6c..b3f78d13e3 100644 --- a/deeppavlov/core/common/chainer.py +++ b/deeppavlov/core/common/chainer.py @@ -225,9 +225,9 @@ def _compute(*args, param_names, pipe, targets): for (in_keys, in_params), out_params, component in pipe: x = [mem[k] for k in in_params] if in_keys: - res = component(**dict(zip(in_keys, x))) + res = component.__call__(**dict(zip(in_keys, x))) else: - res = component(*x) + res = component.__call__(*x) if len(out_params) == 1: mem[out_params[0]] = res else: diff --git a/deeppavlov/core/layers/keras_layers.py b/deeppavlov/core/layers/keras_layers.py index 7c1d379fee..29635537c6 100644 --- a/deeppavlov/core/layers/keras_layers.py +++ b/deeppavlov/core/layers/keras_layers.py @@ -12,11 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from keras import backend as K -from keras.activations import softmax -from keras.engine.topology import Layer -from keras.layers import Dense, Reshape, Concatenate, Lambda -from keras.layers.merge import Multiply +from tensorflow.keras import backend as K +from tensorflow.keras.activations import softmax +from tensorflow.keras.layers import Dense, Reshape, Concatenate, Lambda, Layer, Multiply def expand_tile(units, axis): @@ -102,11 +100,11 @@ def multiplicative_self_attention(units, n_hidden=None, n_output_features=None, return output -class FullMatchingLayer(Layer): - +class MatchingLayer(Layer): def __init__(self, output_dim, **kwargs): self.output_dim = output_dim - super(FullMatchingLayer, self).__init__(**kwargs) + self.W = [] + super().__init__(**kwargs) def build(self, input_shape): assert isinstance(input_shape, list) @@ -116,9 +114,17 @@ def build(self, input_shape): shape=(1, input_shape[0][-1]), initializer='uniform', trainable=True)) - super(FullMatchingLayer, self).build(input_shape) # Be sure to call this at the end + super().build(input_shape) # Be sure to call this at the end + + def compute_output_shape(self, input_shape): + assert isinstance(input_shape, list) + shape_a, shape_b = input_shape + return [(shape_a[0], shape_a[1], self.output_dim), (shape_a[0], shape_a[1], self.output_dim)] + + +class FullMatchingLayer(MatchingLayer): - def call(self, x): + def call(self, x, **kwargs): assert isinstance(x, list) inp_a, inp_b = x last_state = K.expand_dims(inp_b[:, -1, :], 1) @@ -136,29 +142,10 @@ def call(self, x): persp = m[0] return [persp, persp] - def compute_output_shape(self, input_shape): - assert isinstance(input_shape, list) - shape_a, shape_b = input_shape - return [(shape_a[0], shape_a[1], self.output_dim), (shape_a[0], shape_a[1], self.output_dim)] +class MaxpoolingMatchingLayer(MatchingLayer): -class 
MaxpoolingMatchingLayer(Layer): - - def __init__(self, output_dim, **kwargs): - self.output_dim = output_dim - super(MaxpoolingMatchingLayer, self).__init__(**kwargs) - - def build(self, input_shape): - assert isinstance(input_shape, list) - self.W = [] - for i in range(self.output_dim): - self.W.append(self.add_weight(name='kernel', - shape=(1, input_shape[0][-1]), - initializer='uniform', - trainable=True)) - super(MaxpoolingMatchingLayer, self).build(input_shape) # Be sure to call this at the end - - def call(self, x): + def call(self, x, **kwargs): assert isinstance(x, list) inp_a, inp_b = x m = [] @@ -176,29 +163,10 @@ def call(self, x): persp = m[0] return [persp, persp] - def compute_output_shape(self, input_shape): - assert isinstance(input_shape, list) - shape_a, shape_b = input_shape - return [(shape_a[0], shape_a[1], self.output_dim), (shape_a[0], shape_a[1], self.output_dim)] - -class AttentiveMatchingLayer(Layer): +class AttentiveMatchingLayer(MatchingLayer): - def __init__(self, output_dim, **kwargs): - self.output_dim = output_dim - super(AttentiveMatchingLayer, self).__init__(**kwargs) - - def build(self, input_shape): - assert isinstance(input_shape, list) - self.W = [] - for i in range(self.output_dim): - self.W.append(self.add_weight(name='kernel', - shape=(1, input_shape[0][-1]), - initializer='uniform', - trainable=True)) - super(AttentiveMatchingLayer, self).build(input_shape) # Be sure to call this at the end - - def call(self, x): + def call(self, x, **kwargs): assert isinstance(x, list) inp_a, inp_b = x @@ -224,29 +192,10 @@ def call(self, x): persp = m[0] return [persp, persp] - def compute_output_shape(self, input_shape): - assert isinstance(input_shape, list) - shape_a, shape_b = input_shape - return [(shape_a[0], shape_a[1], self.output_dim), (shape_a[0], shape_a[1], self.output_dim)] - - -class MaxattentiveMatchingLayer(Layer): - def __init__(self, output_dim, **kwargs): - self.output_dim = output_dim - super(MaxattentiveMatchingLayer, self).__init__(**kwargs) - - def build(self, input_shape): - assert isinstance(input_shape, list) - self.W = [] - for i in range(self.output_dim): - self.W.append(self.add_weight(name='kernel', - shape=(1, input_shape[0][-1]), - initializer='uniform', - trainable=True)) - super(MaxattentiveMatchingLayer, self).build(input_shape) # Be sure to call this at the end +class MaxattentiveMatchingLayer(MatchingLayer): - def call(self, x): + def call(self, x, **kwargs): assert isinstance(x, list) inp_a, inp_b = x @@ -272,8 +221,3 @@ def call(self, x): else: persp = m[0] return [persp, persp] - - def compute_output_shape(self, input_shape): - assert isinstance(input_shape, list) - shape_a, shape_b = input_shape - return [(shape_a[0], shape_a[1], self.output_dim), (shape_a[0], shape_a[1], self.output_dim)] diff --git a/deeppavlov/core/models/keras_model.py b/deeppavlov/core/models/keras_model.py index 7c4bedf276..a60f561a1b 100644 --- a/deeppavlov/core/models/keras_model.py +++ b/deeppavlov/core/models/keras_model.py @@ -15,8 +15,8 @@ from abc import abstractmethod from logging import getLogger -import tensorflow as tf -from keras import backend as K +import tensorflow.compat.v1 as tf +from tensorflow.keras import backend as K from overrides import overrides from deeppavlov.core.models.lr_scheduled_model import LRScheduledModel @@ -42,7 +42,7 @@ def __init__(self, **kwargs) -> None: Initialize model using keyword parameters Args: - kwargs (dict): Dictionary with model parameters + kwargs: Dictionary with model parameters """ 
self.epochs_done = 0 self.batches_seen = 0 @@ -104,19 +104,15 @@ def __init__(self, **kwargs): Args: **kwargs: dictionary of parameters """ - if isinstance(kwargs.get("learning_rate"), float) and isinstance(kwargs.get("learning_rate_decay"), float): - KerasModel.__init__(self, **kwargs) - else: - KerasModel.__init__(self, **kwargs) + self.opt = kwargs + KerasModel.__init__(self, **kwargs) + if not(isinstance(kwargs.get("learning_rate"), float) and isinstance(kwargs.get("learning_rate_decay"), float)): LRScheduledModel.__init__(self, **kwargs) @abstractmethod def get_optimizer(self): """ - Return instance of keras optimizer - - Args: - None + Return an instance of keras optimizer """ pass diff --git a/deeppavlov/core/models/tf_backend.py b/deeppavlov/core/models/tf_backend.py index e52f59b68d..f6d9bc018c 100644 --- a/deeppavlov/core/models/tf_backend.py +++ b/deeppavlov/core/models/tf_backend.py @@ -15,7 +15,7 @@ from abc import ABCMeta from functools import wraps -import tensorflow as tf +import tensorflow.compat.v1 as tf from six import with_metaclass @@ -30,14 +30,12 @@ def _wrapped(*args, **kwargs): return _wrapped -def _keras_wrap(func, graph, session): +def _keras_wrap(func, session): """Constructs function encapsulated in the graph and the session.""" - import keras.backend as K - @wraps(func) def _wrapped(*args, **kwargs): - with graph.as_default(): - K.set_session(session) + with session.graph.as_default(): + tf.keras.backend.set_session(session) return func(*args, **kwargs) return _wrapped @@ -47,10 +45,10 @@ class TfModelMeta(with_metaclass(type, ABCMeta)): """Metaclass that helps all child classes to have their own graph and session.""" def __call__(cls, *args, **kwargs): - obj = cls.__new__(cls) + obj = cls.__new__(cls, *args, **kwargs) from .keras_model import KerasModel if issubclass(cls, KerasModel): - import keras.backend as K + from tensorflow.keras import backend as K if K.backend() != 'tensorflow': obj.__init__(*args, **kwargs) return obj @@ -71,7 +69,7 @@ def __call__(cls, *args, **kwargs): attr = getattr(obj, meth) if callable(attr): if issubclass(cls, KerasModel): - wrapped_attr = _keras_wrap(attr, obj.graph, obj.sess) + wrapped_attr = _keras_wrap(attr, obj.sess) else: wrapped_attr = _graph_wrap(attr, obj.graph) setattr(obj, meth, wrapped_attr) diff --git a/deeppavlov/dataset_iterators/siamese_iterator.py b/deeppavlov/dataset_iterators/siamese_iterator.py index dd418d6532..c111d36053 100644 --- a/deeppavlov/dataset_iterators/siamese_iterator.py +++ b/deeppavlov/dataset_iterators/siamese_iterator.py @@ -23,82 +23,14 @@ @register('siamese_iterator') class SiameseIterator(DataLearningIterator): - """The class contains methods for iterating over a dataset for ranking in training, validation and test mode. + """The class contains methods for iterating over a dataset for ranking in training, validation and test mode.""" - Args: - data: A dictionary containing training, validation and test parts of the dataset obtainable via - ``train``, ``valid`` and ``test`` keys. - seed: Random seed. - shuffle: Whether to shuffle data. - num_samples: A number of data samples to use in ``train``, ``validation`` and ``test`` mode. - random_batches: Whether to choose batches randomly or iterate over data sequentally in training mode. - batches_per_epoch: A number of batches to choose per each epoch in training mode. - Only required if ``random_batches`` is set to ``True``. 
- """ - - def __init__(self, - data: Dict[str, List], - seed: int = None, - shuffle: bool = False, - num_samples: int = None, - random_batches: bool = False, - batches_per_epoch: int = None, - *args, **kwargs) -> None: - - self.len_valid = kwargs.get("len_valid", 1000) - self.len_test = kwargs.get("len_test", 1000) - super().__init__(data, seed=seed, shuffle=shuffle, *args, **kwargs) - self.random_batches = random_batches - self.batches_per_epoch = batches_per_epoch - self.data["train"] = self.train[:num_samples] - self.data["valid"] = self.valid[:num_samples] - self.data["test"] = self.test[:num_samples] - self.data["all"] = self.train + self.valid + self.test - - def split(self, *args, **kwargs) -> None: - if len(self.valid) == 0 and self.len_valid != 0: + def split(self, *args, len_valid=1000, len_test=1000, **kwargs) -> None: + if len(self.valid) == 0 and len_valid != 0: self.random.shuffle(self.train) - self.valid = self.train[-self.len_valid:] - self.train = self.train[:-self.len_valid] - if len(self.test) == 0: + self.valid = self.train[-len_valid:] + self.train = self.train[:-len_valid] + if len(self.test) == 0 and len_test != 0: self.random.shuffle(self.train) - self.test = self.train[-self.len_test:] - self.train = self.train[:-self.len_test] - - def gen_batches(self, batch_size: int, data_type: str = "train", shuffle: bool = True) -> \ - Tuple[List[List[Tuple[int, int]]], List[int]]: - """Generate batches of inputs and expected outputs to train neural networks. - - Args: - batch_size: number of samples in batch - data_type: can be either 'train', 'test', or 'valid' - shuffle: whether to shuffle dataset before batching - - Yields: - A tuple of a batch of inputs and a batch of expected outputs. - - Inputs and expected outputs have different structure and meaning - depending on class attributes values and ``data_type``. - """ - data = self.data[data_type] - if self.random_batches and self.batches_per_epoch is not None and data_type == "train": - num_steps = self.batches_per_epoch - if batch_size > len(data): - batch_size = len(data) - log.warning("The batch size exceeds the dataset size. Setting it equal to the dataset size.") - else: - num_steps = len(data) // batch_size - if data_type == "train": - if shuffle: - self.random.shuffle(data) - for i in range(num_steps): - if self.random_batches: - context_response_data = self.random.sample(data, k=batch_size) - else: - context_response_data = data[i * batch_size:(i + 1) * batch_size] - yield tuple(zip(*context_response_data)) - if data_type in ["valid", "test"]: - for i in range(num_steps + 1): - context_response_data = data[i * batch_size:(i + 1) * batch_size] - if context_response_data: - yield tuple(zip(*context_response_data)) + self.test = self.train[-len_test:] + self.train = self.train[:-len_test] diff --git a/deeppavlov/dataset_readers/paraphraser_reader.py b/deeppavlov/dataset_readers/paraphraser_reader.py index e73e12985d..db4476bca1 100644 --- a/deeppavlov/dataset_readers/paraphraser_reader.py +++ b/deeppavlov/dataset_readers/paraphraser_reader.py @@ -13,6 +13,7 @@ # limitations under the License. 
import xml.etree.ElementTree as ET +from pathlib import Path from typing import Dict, List, Tuple from deeppavlov.core.commands.utils import expand_path @@ -30,48 +31,31 @@ class ParaphraserReader(DatasetReader): def read(self, data_path: str, do_lower_case: bool = True, - seed: int = None, *args, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]: + *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]: """Read the paraphraser.ru dataset from files. Args: data_path: A path to a folder with dataset files. do_lower_case: Do you want to lowercase all texts - seed: Random seed. """ data_path = expand_path(data_path) train_fname = data_path / 'paraphrases.xml' test_fname = data_path / 'paraphrases_gold.xml' - train_data = self.build_data(train_fname, do_lower_case) - test_data = self.build_data(test_fname, do_lower_case) - dataset = {"train": train_data, "valid": [], "test": test_data} - return dataset - def build_data(self, fname, do_lower_case): - with open(fname, 'r') as labels_file: - context = ET.iterparse(labels_file, events=("start", "end")) - # turn it into an iterator - context = iter(context) - # get the root element - event, root = next(context) - same_set = set() - questions = [] - labels = [] - for event, elem in context: - if event == "end" and elem.tag == "paraphrase": - question = [] - y = None - for child in elem.iter(): - if child.get('name') == 'text_1': - question.append(child.text.lower() if do_lower_case else child.text) - if child.get('name') == 'text_2': - question.append(child.text.lower() if do_lower_case else child.text) - if child.get('name') == 'class': - y = 1 if int(child.text) >= 0 else 0 - root.clear() - check_string = "\n".join(question) - if check_string not in same_set: - same_set.add(check_string) - questions.append(question) - labels.append(y) - return list(zip(questions, labels)) + train_data = self._build_data(train_fname, do_lower_case) + test_data = self._build_data(test_fname, do_lower_case) + return {"train": train_data, "valid": [], "test": test_data} + + @staticmethod + def _build_data(data_path: Path, do_lower_case: bool) -> List[Tuple[Tuple[str, str], int]]: + root = ET.fromstring(data_path.read_text(encoding='utf8')) + data = {} + for paraphrase in root.findall('corpus/paraphrase'): + key = (paraphrase.find('value[@name="text_1"]').text, + paraphrase.find('value[@name="text_2"]').text) + if do_lower_case: + key = tuple([t.lower() for t in key]) + + data[key] = 1 if int(paraphrase.find('value[@name="class"]').text) >= 0 else 0 + return list(data.items()) diff --git a/deeppavlov/models/classifiers/keras_classification_model.py b/deeppavlov/models/classifiers/keras_classification_model.py index 865ef811a9..1bffd9c9ac 100644 --- a/deeppavlov/models/classifiers/keras_classification_model.py +++ b/deeppavlov/models/classifiers/keras_classification_model.py @@ -17,21 +17,15 @@ from pathlib import Path from typing import List, Tuple, Optional, Generator, Union -import keras.metrics -import keras.optimizers import numpy as np -from keras import backend as K -from keras.layers import Dense, Input -from keras.layers import concatenate, Activation, Concatenate, Reshape -from keras.layers.convolutional import Conv1D -from keras.layers.core import Dropout -from keras.layers.normalization import BatchNormalization -from keras.layers.pooling import GlobalMaxPooling1D, MaxPooling1D, GlobalAveragePooling1D -from keras.layers.recurrent import LSTM, GRU -from keras.layers.wrappers import Bidirectional -from keras.models import Model -from 
keras.regularizers import l2 +import tensorflow.keras from overrides import overrides +from tensorflow.keras import backend as K +from tensorflow.keras.layers import (Conv1D, Dropout, Dense, Input, BatchNormalization, GlobalMaxPooling1D, + MaxPooling1D, concatenate, Activation, Reshape, + GlobalAveragePooling1D, LSTM, GRU, Bidirectional) +from tensorflow.keras.models import Model +from tensorflow.keras.regularizers import l2 from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.file import save_json, read_json @@ -117,6 +111,7 @@ def __init__(self, embedding_size: int, n_classes: int, **kwargs} self.opt = deepcopy(given_opt) self.model = None + self.optimizer = None super().__init__(**given_opt) @@ -148,14 +143,14 @@ def pad_texts(self, sentences: List[List[np.ndarray]]) -> Union[np.ndarray, Tupl array of embedded texts """ pad = np.zeros(self.opt['embedding_size']) - cutted_batch = [sen[:self.opt['text_size']] for sen in sentences] + cut_batch = [sen[:self.opt['text_size']] for sen in sentences] if self.opt["padding"] == "pre": - cutted_batch = [[pad] * (self.opt['text_size'] - len(tokens)) + list(tokens) for tokens in cutted_batch] + cut_batch = [[pad] * (self.opt['text_size'] - len(tokens)) + list(tokens) for tokens in cut_batch] elif self.opt["padding"] == "post": - cutted_batch = [list(tokens) + [pad] * (self.opt['text_size'] - len(tokens)) for tokens in cutted_batch] + cut_batch = [list(tokens) + [pad] * (self.opt['text_size'] - len(tokens)) for tokens in cut_batch] else: raise ConfigError("Padding type {} is not acceptable".format(self.opt['padding'])) - return np.asarray(cutted_batch) + return np.asarray(cut_batch) def check_input(self, texts: List[List[np.ndarray]]) -> np.ndarray: """ @@ -193,43 +188,20 @@ def train_on_batch(self, texts: List[List[np.ndarray]], labels: list) -> Union[f metrics_values = self.model.train_on_batch(features, np.array(labels)) return metrics_values - def infer_on_batch(self, texts: List[List[np.ndarray]], labels: list = None) -> \ - Union[float, List[float], np.ndarray]: - """ - Infer the model on the given batch - - Args: - texts: list of tokenized embedded text samples - labels: list of labels - - Returns: - metrics values on the given batch, if labels are given - predictions, otherwise - """ - features = self.check_input(texts) - - if labels: - metrics_values = self.model.test_on_batch(features, np.array(labels)) - return metrics_values - else: - predictions = self.model.predict(features) - return predictions - - def __call__(self, data: List[List[np.ndarray]], *args) -> List[List[float]]: + def __call__(self, data: List[List[np.ndarray]]) -> List[List[float]]: """ Infer on the given data Args: data: list of tokenized text samples - *args: additional arguments Returns: for each sentence: vector of probabilities to belong with each class or list of labels sentence belongs with """ - preds = np.array(self.infer_on_batch(data), dtype="float64").tolist() - return preds + features = self.check_input(data) + return self.model.predict(features) def init_model_from_scratch(self, model_name: str) -> Model: """ @@ -284,8 +256,8 @@ def _load(self, model_name: str) -> None: try: model.load_weights(str(weights_path)) except ValueError: - raise ConfigError( - "Some non-changable parameters of neural network differ from given pre-trained model") + raise ConfigError("Some non-changeable parameters of neural network differ" + " from given pre-trained model") self.model = model @@ -314,7 +286,7 @@ def compile(self, model: Model, 
optimizer_name: str, loss_name: str, Returns: """ - optimizer_func = getattr(keras.optimizers, optimizer_name, None) + optimizer_func = getattr(tensorflow.keras.optimizers, optimizer_name, None) if callable(optimizer_func): if isinstance(learning_rate, float) and isinstance(learning_rate_decay, float): # in this case decay will be either given in config or, by default, learning_rate_decay=0. @@ -322,9 +294,9 @@ def compile(self, model: Model, optimizer_name: str, loss_name: str, else: self.optimizer = optimizer_func() else: - raise AttributeError("Optimizer {} is not defined in `keras.optimizers`".format(optimizer_name)) + raise AttributeError("Optimizer {} is not defined in `tensorflow.keras.optimizers`".format(optimizer_name)) - loss_func = getattr(keras.losses, loss_name, None) + loss_func = getattr(tensorflow.keras.losses, loss_name, None) if callable(loss_func): loss = loss_func else: @@ -378,15 +350,15 @@ def save(self, fname: str = None) -> None: # then change load_path to save_path for config self.opt["epochs_done"] = self.epochs_done if isinstance(self.opt.get("learning_rate", None), float): - self.opt["final_learning_rate"] = K.eval(self.optimizer.lr) / (1. + - K.eval( - self.optimizer.decay) * self.batches_seen) + self.opt["final_learning_rate"] = (K.eval(self.optimizer.lr) / + (1. + K.eval(self.optimizer.decay) * self.batches_seen)) if self.opt.get("load_path") and self.opt.get("save_path"): if self.opt.get("save_path") != self.opt.get("load_path"): self.opt["load_path"] = str(self.opt["save_path"]) save_json(self.opt, opt_path) + # noinspection PyUnusedLocal def cnn_model(self, kernel_sizes_cnn: List[int], filters_cnn: int, dense_size: int, coef_reg_cnn: float = 0., coef_reg_den: float = 0., dropout_rate: float = 0., input_projection_size: Optional[int] = None, **kwargs) -> Model: @@ -440,6 +412,7 @@ def cnn_model(self, kernel_sizes_cnn: List[int], filters_cnn: int, dense_size: i model = Model(inputs=inp, outputs=act_output) return model + # noinspection PyUnusedLocal def dcnn_model(self, kernel_sizes_cnn: List[int], filters_cnn: List[int], dense_size: int, coef_reg_cnn: float = 0., coef_reg_den: float = 0., dropout_rate: float = 0., input_projection_size: Optional[int] = None, **kwargs) -> Model: @@ -490,6 +463,7 @@ def dcnn_model(self, kernel_sizes_cnn: List[int], filters_cnn: List[int], dense_ model = Model(inputs=inp, outputs=act_output) return model + # noinspection PyUnusedLocal def cnn_model_max_and_aver_pool(self, kernel_sizes_cnn: List[int], filters_cnn: int, dense_size: int, coef_reg_cnn: float = 0., coef_reg_den: float = 0., dropout_rate: float = 0., input_projection_size: Optional[int] = None, **kwargs) -> Model: @@ -529,7 +503,7 @@ def cnn_model_max_and_aver_pool(self, kernel_sizes_cnn: List[int], filters_cnn: output_i = Activation('relu')(output_i) output_i_0 = GlobalMaxPooling1D()(output_i) output_i_1 = GlobalAveragePooling1D()(output_i) - output_i = Concatenate()([output_i_0, output_i_1]) + output_i = concatenate([output_i_0, output_i_1]) outputs.append(output_i) output = concatenate(outputs, axis=1) @@ -547,6 +521,7 @@ def cnn_model_max_and_aver_pool(self, kernel_sizes_cnn: List[int], filters_cnn: model = Model(inputs=inp, outputs=act_output) return model + # noinspection PyUnusedLocal def bilstm_model(self, units_lstm: int, dense_size: int, coef_reg_lstm: float = 0., coef_reg_den: float = 0., dropout_rate: float = 0., rec_dropout_rate: float = 0., @@ -594,6 +569,7 @@ def bilstm_model(self, units_lstm: int, dense_size: int, model = Model(inputs=inp, 
outputs=act_output) return model + # noinspection PyUnusedLocal def bilstm_bilstm_model(self, units_lstm_1: int, units_lstm_2: int, dense_size: int, coef_reg_lstm: float = 0., coef_reg_den: float = 0., dropout_rate: float = 0., rec_dropout_rate: float = 0., @@ -650,6 +626,7 @@ def bilstm_bilstm_model(self, units_lstm_1: int, units_lstm_2: int, dense_size: model = Model(inputs=inp, outputs=act_output) return model + # noinspection PyUnusedLocal def bilstm_cnn_model(self, units_lstm: int, kernel_sizes_cnn: List[int], filters_cnn: int, dense_size: int, coef_reg_lstm: float = 0., coef_reg_cnn: float = 0., coef_reg_den: float = 0., dropout_rate: float = 0., rec_dropout_rate: float = 0., @@ -701,7 +678,7 @@ def bilstm_cnn_model(self, units_lstm: int, kernel_sizes_cnn: List[int], filters output_i = GlobalMaxPooling1D()(output_i) outputs.append(output_i) - output = Concatenate(axis=1)(outputs) + output = concatenate(outputs, axis=1) output = Dropout(rate=dropout_rate)(output) output = Dense(dense_size, activation=None, kernel_regularizer=l2(coef_reg_den))(output) @@ -713,6 +690,7 @@ def bilstm_cnn_model(self, units_lstm: int, kernel_sizes_cnn: List[int], filters model = Model(inputs=inp, outputs=act_output) return model + # noinspection PyUnusedLocal def cnn_bilstm_model(self, kernel_sizes_cnn: List[int], filters_cnn: int, units_lstm: int, dense_size: int, coef_reg_cnn: float = 0., coef_reg_lstm: float = 0., coef_reg_den: float = 0., dropout_rate: float = 0., rec_dropout_rate: float = 0., @@ -777,6 +755,7 @@ def cnn_bilstm_model(self, kernel_sizes_cnn: List[int], filters_cnn: int, units_ model = Model(inputs=inp, outputs=act_output) return model + # noinspection PyUnusedLocal def bilstm_self_add_attention_model(self, units_lstm: int, dense_size: int, self_att_hid: int, self_att_out: int, coef_reg_lstm: float = 0., coef_reg_den: float = 0., dropout_rate: float = 0., rec_dropout_rate: float = 0., @@ -830,6 +809,7 @@ def bilstm_self_add_attention_model(self, units_lstm: int, dense_size: int, self model = Model(inputs=inp, outputs=act_output) return model + # noinspection PyUnusedLocal def bilstm_self_mult_attention_model(self, units_lstm: int, dense_size: int, self_att_hid: int, self_att_out: int, coef_reg_lstm: float = 0., coef_reg_den: float = 0., dropout_rate: float = 0., rec_dropout_rate: float = 0., @@ -883,6 +863,7 @@ def bilstm_self_mult_attention_model(self, units_lstm: int, dense_size: int, sel model = Model(inputs=inp, outputs=act_output) return model + # noinspection PyUnusedLocal def bigru_model(self, units_gru: int, dense_size: int, coef_reg_lstm: float = 0., coef_reg_den: float = 0., dropout_rate: float = 0., rec_dropout_rate: float = 0., @@ -930,6 +911,7 @@ def bigru_model(self, units_gru: int, dense_size: int, model = Model(inputs=inp, outputs=act_output) return model + # noinspection PyUnusedLocal def bigru_with_max_aver_pool_model(self, units_gru: int, dense_size: int, coef_reg_gru: float = 0., coef_reg_den: float = 0., dropout_rate: float = 0., rec_dropout_rate: float = 0., @@ -964,7 +946,7 @@ def bigru_with_max_aver_pool_model(self, units_gru: int, dense_size: int, output1 = GlobalMaxPooling1D()(output) output2 = GlobalAveragePooling1D()(output) - output = Concatenate()([output1, output2, state1, state2]) + output = concatenate([output1, output2, state1, state2]) output = Dropout(rate=dropout_rate)(output) output = Dense(dense_size, activation=None, diff --git a/deeppavlov/models/embedders/fasttext_embedder.py b/deeppavlov/models/embedders/fasttext_embedder.py index 
2962accd45..df4973d9d7 100644 --- a/deeppavlov/models/embedders/fasttext_embedder.py +++ b/deeppavlov/models/embedders/fasttext_embedder.py @@ -15,21 +15,7 @@ from logging import getLogger from typing import Iterator -try: - import fastText -except ModuleNotFoundError as e: - import re - import sys - from pathlib import Path - - ft_req_path = Path(__file__).resolve().parents[2].joinpath('requirements', 'fasttext.txt') - packages = ft_req_path.read_text(encoding='utf8').strip() - packages = re.sub(r'\s+', ' ', packages) - - raise ModuleNotFoundError(f'{e}\n\nYou can install fastText by running\n' - f'{sys.executable} -m pip install {packages}\n' - 'or for your deeppavlov pipeline configuration\n' - f'{sys.executable} -m deeppavlov install ') +import fasttext import numpy as np from overrides import overrides @@ -65,7 +51,7 @@ def load(self) -> None: Load fastText binary model from self.load_path """ log.info(f"[loading fastText embeddings from `{self.load_path}`]") - self.model = fastText.load_model(str(self.load_path)) + self.model = fasttext.load_model(str(self.load_path)) self.dim = self.model.get_dimension() @overrides diff --git a/deeppavlov/models/morpho_tagger/cells.py b/deeppavlov/models/morpho_tagger/cells.py index 1736859c75..f781ba53d6 100644 --- a/deeppavlov/models/morpho_tagger/cells.py +++ b/deeppavlov/models/morpho_tagger/cells.py @@ -12,24 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -import keras.activations as kact -import keras.backend as kb -import keras.initializers as kinit -import keras.layers as kl import numpy as np -from keras.engine.topology import InputSpec +import tensorflow as tf +import tensorflow.keras.backend as K +from tensorflow.keras.initializers import Constant +from tensorflow.keras.layers import InputSpec, Layer, Lambda, Dropout, Multiply INFTY = -100 -class Highway(kl.Layer): +class Highway(Layer): def __init__(self, activation=None, bias_initializer=-1, **kwargs): - super(Highway, self).__init__(**kwargs) - self.activation = kact.get(activation) + super().__init__(**kwargs) + self.activation = tf.keras.activations.get(activation) self.bias_initializer = bias_initializer if isinstance(self.bias_initializer, int): - self.bias_initializer = kinit.constant(self.bias_initializer) + self.bias_initializer = Constant(self.bias_initializer) self.input_spec = [InputSpec(min_ndim=2)] def build(self, input_shape): @@ -48,11 +47,11 @@ def build(self, input_shape): self.built = True def call(self, inputs, **kwargs): - gate = kb.dot(inputs, self.gate_kernel) - gate = kb.bias_add(gate, self.gate_bias, data_format="channels_last") + gate = K.dot(inputs, self.gate_kernel) + gate = K.bias_add(gate, self.gate_bias, data_format="channels_last") gate = self.activation(gate) - new_value = kb.dot(inputs, self.dense_kernel) - new_value = kb.bias_add(new_value, self.dense_bias, data_format="channels_last") + new_value = K.dot(inputs, self.dense_kernel) + new_value = K.bias_add(new_value, self.dense_bias, data_format="channels_last") return gate * new_value + (1.0 - gate) * inputs def compute_output_shape(self, input_shape): @@ -61,13 +60,14 @@ def compute_output_shape(self, input_shape): def weighted_sum(first, second, sigma, first_threshold=-np.inf, second_threshold=np.inf): logit_probs = first * sigma + second * (1.0 - sigma) - infty_tensor = kb.ones_like(logit_probs) * INFTY - logit_probs = kb.switch(kb.greater(first, first_threshold), logit_probs, infty_tensor) - logit_probs = kb.switch(kb.greater(second, 
second_threshold), logit_probs, infty_tensor) + infty_tensor = K.ones_like(logit_probs) * INFTY + logit_probs = K.switch(K.greater(first, first_threshold), logit_probs, infty_tensor) + logit_probs = K.switch(K.greater(second, second_threshold), logit_probs, infty_tensor) return logit_probs -class WeightedCombinationLayer(kl.Layer): +class WeightedCombinationLayer(Layer): + """ A class for weighted combination of probability distributions """ @@ -85,7 +85,7 @@ def __init__(self, first_threshold=None, second_threshold=None, self.use_dimension_bias = use_dimension_bias self.use_intermediate_layer = use_intermediate_layer self.intermediate_dim = intermediate_dim - self.intermediate_activation = kact.get(intermediate_activation) + self.intermediate_activation = tf.keras.activations.get(intermediate_activation) self.from_logits = from_logits self.return_logits = return_logits self.bias_initializer = bias_initializer @@ -107,7 +107,7 @@ def build(self, input_shape): self.features_kernel = self.add_weight( shape=(features_dim, 1), initializer="random_uniform", name='kernel') self.features_bias = self.add_weight( - shape=(1,), initializer=kinit.Constant(self.bias_initializer), name='bias') + shape=(1,), initializer=Constant(self.bias_initializer), name='bias') if self.use_dimension_bias: self.dimensions_bias = self.add_weight( shape=(input_dim,), initializer="random_uniform", name='dimension_bias') @@ -117,29 +117,29 @@ def call(self, inputs, **kwargs): assert isinstance(inputs, list) and len(inputs) == 3 first, second, features = inputs[0], inputs[1], inputs[2] if not self.from_logits: - first = kb.clip(first, 1e-10, 1.0) - second = kb.clip(second, 1e-10, 1.0) - first_, second_ = kb.log(first), kb.log(second) + first = K.clip(first, 1e-10, 1.0) + second = K.clip(second, 1e-10, 1.0) + first_, second_ = K.log(first), K.log(second) else: first_, second_ = first, second # embedded_features.shape = (M, T, 1) if self.use_intermediate_layer: - features = kb.dot(features, self.first_kernel) - features = kb.bias_add(features, self.first_bias, data_format="channels_last") + features = K.dot(features, self.first_kernel) + features = K.bias_add(features, self.first_bias, data_format="channels_last") features = self.intermediate_activation(features) - embedded_features = kb.dot(features, self.features_kernel) - embedded_features = kb.bias_add( + embedded_features = K.dot(features, self.features_kernel) + embedded_features = K.bias_add( embedded_features, self.features_bias, data_format="channels_last") if self.use_dimension_bias: - tiling_shape = [1] * (kb.ndim(first) - 1) + [kb.shape(first)[-1]] - embedded_features = kb.tile(embedded_features, tiling_shape) - embedded_features = kb.bias_add( + tiling_shape = [1] * (K.ndim(first) - 1) + [K.shape(first)[-1]] + embedded_features = K.tile(embedded_features, tiling_shape) + embedded_features = K.bias_add( embedded_features, self.dimensions_bias, data_format="channels_last") - sigma = kb.sigmoid(embedded_features) + sigma = K.sigmoid(embedded_features) result = weighted_sum(first_, second_, sigma, self.first_threshold, self.second_threshold) - probs = kb.softmax(result) + probs = K.softmax(result) if self.return_logits: return [probs, result] return probs @@ -158,13 +158,13 @@ def TemporalDropout(inputs, dropout=0.0): # TO DO: adapt for >3D tensors if dropout == 0.0: return inputs - inputs_func = lambda x: kb.ones_like(inputs[:, :, 0:1]) - inputs_mask = kl.Lambda(inputs_func)(inputs) - inputs_mask = kl.Dropout(dropout)(inputs_mask) - tiling_shape = [1, 1, 
kb.shape(inputs)[2]] + [1] * (kb.ndim(inputs) - 3) - inputs_mask = kl.Lambda(kb.tile, arguments={"n": tiling_shape}, - output_shape=inputs._keras_shape[1:])(inputs_mask) - answer = kl.Multiply()([inputs, inputs_mask]) + inputs_func = lambda x: K.ones_like(inputs[:, :, 0:1]) + inputs_mask = Lambda(inputs_func)(inputs) + inputs_mask = Dropout(dropout)(inputs_mask) + tiling_shape = [1, 1, K.shape(inputs)[2]] + [1] * (K.ndim(inputs) - 3) + inputs_mask = Lambda(K.tile, arguments={"n": tiling_shape}, + output_shape=inputs._keras_shape[1:])(inputs_mask) + answer = Multiply()([inputs, inputs_mask]) return answer @@ -174,6 +174,6 @@ def positions_func(inputs, pad=0): 1+ln(1+i) when it contains a meaningful symbol and with 0 when it contains PAD """ - position_inputs = kb.cumsum(kb.ones_like(inputs, dtype="float32"), axis=1) - position_inputs *= kb.cast(kb.not_equal(inputs, pad), "float32") - return kb.log(1.0 + position_inputs) + position_inputs = K.cumsum(K.ones_like(inputs, dtype="float32"), axis=1) + position_inputs *= K.cast(K.not_equal(inputs, pad), "float32") + return K.log(1.0 + position_inputs) \ No newline at end of file diff --git a/deeppavlov/models/morpho_tagger/common_tagger.py b/deeppavlov/models/morpho_tagger/common_tagger.py index 86959d9919..dfc7e330aa 100644 --- a/deeppavlov/models/morpho_tagger/common_tagger.py +++ b/deeppavlov/models/morpho_tagger/common_tagger.py @@ -16,7 +16,7 @@ from typing import Union, Optional, Tuple -import keras.backend as kb +from tensorflow.keras import backend as K import numpy as np EPS = 1e-15 @@ -36,8 +36,8 @@ def to_one_hot(x, k): def repeat_(x, k): - tile_factor = [1, k] + [1] * (kb.ndim(x) - 1) - return kb.tile(x[:, None, :], tile_factor) + tile_factor = [1, k] + [1] * (K.ndim(x) - 1) + return K.tile(x[:, None, :], tile_factor) def make_pos_and_tag(tag: str, sep: str = ",", diff --git a/deeppavlov/models/morpho_tagger/morpho_tagger.py b/deeppavlov/models/morpho_tagger/morpho_tagger.py index ad112cb206..4ebe734f39 100644 --- a/deeppavlov/models/morpho_tagger/morpho_tagger.py +++ b/deeppavlov/models/morpho_tagger/morpho_tagger.py @@ -16,12 +16,13 @@ from pathlib import Path from typing import List, Optional, Union, Tuple -import keras.backend as kb -import keras.layers as kl -import keras.optimizers as ko -import keras.regularizers as kreg import numpy as np -from keras import Model +import tensorflow.keras.backend as K +from tensorflow.keras import Model +from tensorflow.keras.layers import (Input, Dense, Lambda, Concatenate, Conv2D, Dropout, LSTM, Bidirectional, + TimeDistributed) +from tensorflow.keras.optimizers import Nadam +from tensorflow.keras.regularizers import l2 from deeppavlov.core.common.registry import register from deeppavlov.core.data.simple_vocab import SimpleVocabulary @@ -75,7 +76,6 @@ class MorphoTagger(KerasModel): A subclass of :class:`~deeppavlov.core.models.keras_model.KerasModel` """ - def __init__(self, symbols: SimpleVocabulary, tags: SimpleVocabulary, @@ -100,7 +100,7 @@ def __init__(self, regularizer: float = None, verbose: int = 1, **kwargs): # Calls parent constructor. 
Results in creation of save_folder if it doesn't exist - super().__init__(save_path=save_path, load_path=load_path, mode=mode) + super().__init__(save_path=save_path, load_path=load_path, mode=mode, **kwargs) self.symbols = symbols self.tags = tags self.word_rnn = word_rnn @@ -121,6 +121,7 @@ def __init__(self, self.regularizer = regularizer self.verbose = verbose self._initialize() + self.model_ = None self.build() # Tries to load the model from model `load_path`, if it is available @@ -160,25 +161,25 @@ def _initialize(self): if self.word_vectorizers is None: self.word_vectorizers = [] if self.regularizer is not None: - self.regularizer = kreg.l2(self.regularizer) + self.regularizer = l2(self.regularizer) if self.verbose > 0: log.info("{} symbols, {} tags in CharacterTagger".format(len(self.symbols), len(self.tags))) def build(self): """Builds the network using Keras. """ - word_inputs = kl.Input(shape=(None, MAX_WORD_LENGTH + 2), dtype="int32") + word_inputs = Input(shape=(None, MAX_WORD_LENGTH+2), dtype="int32") inputs = [word_inputs] word_outputs = self._build_word_cnn(word_inputs) if len(self.word_vectorizers) > 0: - additional_word_inputs = [kl.Input(shape=(None, input_dim), dtype="float32") + additional_word_inputs = [Input(shape=(None, input_dim), dtype="float32") for input_dim, dense_dim in self.word_vectorizers] inputs.extend(additional_word_inputs) - additional_word_embeddings = [kl.Dense(dense_dim)(additional_word_inputs[i]) + additional_word_embeddings = [Dense(dense_dim)(additional_word_inputs[i]) for i, (_, dense_dim) in enumerate(self.word_vectorizers)] - word_outputs = kl.Concatenate()([word_outputs] + additional_word_embeddings) + word_outputs = Concatenate()([word_outputs] + additional_word_embeddings) outputs, lstm_outputs = self._build_basic_network(word_outputs) - compile_args = {"optimizer": ko.nadam(lr=0.002, clipnorm=5.0), + compile_args = {"optimizer": Nadam(lr=0.002, clipnorm=5.0), "loss": "categorical_crossentropy", "metrics": ["accuracy"]} self.model_ = Model(inputs, outputs) self.model_.compile(**compile_args) @@ -189,9 +190,9 @@ def build(self): def _build_word_cnn(self, inputs): """Builds word-level network """ - inputs = kl.Lambda(kb.one_hot, arguments={"num_classes": len(self.symbols)}, - output_shape=lambda x: tuple(x) + (len(self.symbols),))(inputs) - char_embeddings = kl.Dense(self.char_embeddings_size, use_bias=False)(inputs) + inputs = Lambda(K.one_hot, arguments={"num_classes": len(self.symbols)}, + output_shape=lambda x: tuple(x) + (len(self.symbols),))(inputs) + char_embeddings = Dense(self.char_embeddings_size, use_bias=False)(inputs) conv_outputs = [] self.char_output_dim_ = 0 for window_size, filters_number in zip(self.char_window_size, self.char_filters): @@ -199,27 +200,27 @@ def _build_word_cnn(self, inputs): curr_filters_number = (min(self.char_filter_multiple * window_size, 200) if filters_number is None else filters_number) for _ in range(self.char_conv_layers - 1): - curr_output = kl.Conv2D(curr_filters_number, (1, window_size), - padding="same", activation="relu", - data_format="channels_last")(curr_output) + curr_output = Conv2D(curr_filters_number, (1, window_size), + padding="same", activation="relu", + data_format="channels_last")(curr_output) if self.conv_dropout > 0.0: - curr_output = kl.Dropout(self.conv_dropout)(curr_output) - curr_output = kl.Conv2D(curr_filters_number, (1, window_size), - padding="same", activation="relu", - data_format="channels_last")(curr_output) + curr_output = Dropout(self.conv_dropout)(curr_output) + 
curr_output = Conv2D(curr_filters_number, (1, window_size), + padding="same", activation="relu", + data_format="channels_last")(curr_output) conv_outputs.append(curr_output) self.char_output_dim_ += curr_filters_number if len(conv_outputs) > 1: - conv_output = kl.Concatenate(axis=-1)(conv_outputs) + conv_output = Concatenate(axis=-1)(conv_outputs) else: conv_output = conv_outputs[0] - highway_input = kl.Lambda(kb.max, arguments={"axis": -2})(conv_output) + highway_input = Lambda(K.max, arguments={"axis": -2})(conv_output) if self.intermediate_dropout > 0.0: - highway_input = kl.Dropout(self.intermediate_dropout)(highway_input) + highway_input = Dropout(self.intermediate_dropout)(highway_input) for i in range(self.char_highway_layers - 1): highway_input = Highway(activation="relu")(highway_input) if self.highway_dropout > 0.0: - highway_input = kl.Dropout(self.highway_dropout)(highway_input) + highway_input = Dropout(self.highway_dropout)(highway_input) highway_output = Highway(activation="relu")(highway_input) return highway_output @@ -229,22 +230,23 @@ def _build_basic_network(self, word_outputs): transforming word embeddings to intermediate outputs """ if self.word_dropout > 0.0: - lstm_outputs = kl.Dropout(self.word_dropout)(word_outputs) + lstm_outputs = Dropout(self.word_dropout)(word_outputs) else: lstm_outputs = word_outputs - for j in range(self.word_lstm_layers - 1): - lstm_outputs = kl.Bidirectional( - kl.LSTM(self.word_lstm_units[j], return_sequences=True, - dropout=self.lstm_dropout))(lstm_outputs) - lstm_outputs = kl.Bidirectional( - kl.LSTM(self.word_lstm_units[-1], return_sequences=True, - dropout=self.lstm_dropout))(lstm_outputs) - pre_outputs = kl.TimeDistributed( - kl.Dense(len(self.tags), activation="softmax", - activity_regularizer=self.regularizer), - name="p")(lstm_outputs) + for j in range(self.word_lstm_layers-1): + lstm_outputs = Bidirectional( + LSTM(self.word_lstm_units[j], return_sequences=True, + dropout=self.lstm_dropout))(lstm_outputs) + lstm_outputs = Bidirectional( + LSTM(self.word_lstm_units[-1], return_sequences=True, + dropout=self.lstm_dropout))(lstm_outputs) + pre_outputs = TimeDistributed( + Dense(len(self.tags), activation="softmax", + activity_regularizer=self.regularizer), + name="p")(lstm_outputs) return pre_outputs, lstm_outputs + # noinspection PyPep8Naming def _transform_batch(self, data, labels=None, transform_to_one_hot=True): data, additional_data = data[0], data[1:] L = max(len(x) for x in data) @@ -271,9 +273,11 @@ def train_on_batch(self, *args) -> None: # data: a batch of word sequences # labels: a batch of correct tag sequences *data, labels = args + # noinspection PyPep8Naming X, Y = self._transform_batch(data, labels) self.model_.train_on_batch(X, Y) + # noinspection PyPep8Naming def predict_on_batch(self, data: Union[List[np.ndarray], Tuple[np.ndarray]], return_indexes: bool = False) -> List[List[str]]: """ @@ -292,13 +296,13 @@ def predict_on_batch(self, data: Union[List[np.ndarray], Tuple[np.ndarray]], objects_number, lengths = len(X[0]), [len(elem) for elem in data[0]] Y = self.model_.predict_on_batch(X) labels = np.argmax(Y, axis=-1) - answer: List[List[str]] = [None] * objects_number + answer: List[Optional[List[str]]] = [None] * objects_number for i, (elem, length) in enumerate(zip(labels, lengths)): elem = elem[:length] answer[i] = elem if return_indexes else self.tags.idxs2toks(elem) return answer - def __call__(self, *x_batch, **kwargs) -> Union[List, np.ndarray]: + def __call__(self, *x_batch: np.ndarray, **kwargs) -> 
Union[List, np.ndarray]: """ Predicts answers on batch elements. @@ -322,14 +326,14 @@ def _make_sent_vector(self, sent: List, bucket_length: int = None) -> np.ndarray in j-th word of i-th input sentence. """ bucket_length = bucket_length or len(sent) - answer = np.zeros(shape=(bucket_length, MAX_WORD_LENGTH + 2), dtype=np.int32) + answer = np.zeros(shape=(bucket_length, MAX_WORD_LENGTH+2), dtype=np.int32) for i, word in enumerate(sent): answer[i, 0] = self.tags["BEGIN"] m = min(len(word), MAX_WORD_LENGTH) for j, x in enumerate(word[-m:]): - answer[i, j + 1] = self.symbols[x] - answer[i, m + 1] = self.tags["END"] - answer[i, m + 2:] = self.tags["PAD"] + answer[i, j+1] = self.symbols[x] + answer[i, m+1] = self.tags["END"] + answer[i, m+2:] = self.tags["PAD"] return answer def _make_tags_vector(self, tags, bucket_length=None) -> np.ndarray: diff --git a/deeppavlov/models/ranking/bilstm_gru_siamese_network.py b/deeppavlov/models/ranking/bilstm_gru_siamese_network.py index 281633db0a..fe243598a9 100644 --- a/deeppavlov/models/ranking/bilstm_gru_siamese_network.py +++ b/deeppavlov/models/ranking/bilstm_gru_siamese_network.py @@ -14,9 +14,9 @@ from logging import getLogger -from keras import backend as K -from keras.layers import Input, GlobalMaxPooling1D, Lambda, Dense, GRU -from keras.models import Model +from tensorflow.keras import backend as K +from tensorflow.keras.layers import Input, GlobalMaxPooling1D, Lambda, Dense, GRU +from tensorflow.keras.models import Model from deeppavlov.core.common.registry import register from deeppavlov.models.ranking.bilstm_siamese_network import BiLSTMSiameseNetwork diff --git a/deeppavlov/models/ranking/bilstm_siamese_network.py b/deeppavlov/models/ranking/bilstm_siamese_network.py index d192b9e9b1..547fe746e9 100644 --- a/deeppavlov/models/ranking/bilstm_siamese_network.py +++ b/deeppavlov/models/ranking/bilstm_siamese_network.py @@ -16,14 +16,13 @@ from typing import List import numpy as np -from keras import backend as K -from keras import losses -from keras.initializers import glorot_uniform, Orthogonal -from keras.layers import Input, LSTM, Embedding, GlobalMaxPooling1D, Lambda, Dense, Layer -from keras.layers.merge import Multiply -from keras.layers.wrappers import Bidirectional -from keras.models import Model -from keras.optimizers import Adam +from tensorflow.keras import backend as K +from tensorflow.keras import losses +from tensorflow.keras.initializers import glorot_uniform, Orthogonal +from tensorflow.keras.layers import (Input, LSTM, Embedding, GlobalMaxPooling1D, Lambda, Dense, Layer, Multiply, + Bidirectional) +from tensorflow.keras.models import Model +from tensorflow.keras.optimizers import Adam from tensorflow.python.framework.ops import Tensor from deeppavlov.core.common.registry import register @@ -195,9 +194,9 @@ def _euclidian_dist(self, x_pair: List[Tensor]) -> Tensor: x2_norm = K.l2_normalize(x_pair[1], axis=1) diff = x1_norm - x2_norm square = K.square(diff) - sum = K.sum(square, axis=1) - sum = K.clip(sum, min_value=1e-12, max_value=None) - dist = K.sqrt(sum) / 2. + _sum = K.sum(square, axis=1) + _sum = K.clip(_sum, min_value=1e-12, max_value=None) + dist = K.sqrt(_sum) / 2. 
return dist def _pairwise_distances(self, inputs: List[Tensor]) -> Tensor: @@ -207,7 +206,7 @@ def _pairwise_distances(self, inputs: List[Tensor]) -> Tensor: dot_product = K.dot(embeddings, K.transpose(embeddings)) square_norm = K.batch_dot(embeddings, embeddings, axes=1) distances = K.transpose(square_norm) - 2.0 * dot_product + square_norm - distances = K.slice(distances, (0, bs), (bs, bs)) + distances = distances[0:bs, bs:bs+bs] distances = K.clip(distances, 0.0, None) mask = K.cast(K.equal(distances, 0.0), K.dtype(distances)) distances = distances + mask * 1e-16 diff --git a/deeppavlov/models/ranking/keras_siamese_model.py b/deeppavlov/models/ranking/keras_siamese_model.py index 545f3469ab..a69365960e 100644 --- a/deeppavlov/models/ranking/keras_siamese_model.py +++ b/deeppavlov/models/ranking/keras_siamese_model.py @@ -17,9 +17,9 @@ from typing import List import numpy as np -from keras import losses -from keras.models import Model -from keras.optimizers import Adam +from tensorflow.keras import losses +from tensorflow.keras.models import Model +from tensorflow.keras.optimizers import Adam from deeppavlov.core.models.keras_model import KerasModel from deeppavlov.models.ranking.siamese_model import SiameseModel diff --git a/deeppavlov/models/ranking/mpm_siamese_network.py b/deeppavlov/models/ranking/mpm_siamese_network.py index e629372e03..cccc26f508 100644 --- a/deeppavlov/models/ranking/mpm_siamese_network.py +++ b/deeppavlov/models/ranking/mpm_siamese_network.py @@ -15,11 +15,10 @@ from logging import getLogger -from keras import backend as K -from keras.initializers import glorot_uniform, Orthogonal -from keras.layers import Input, LSTM, Lambda, Dense, Dropout -from keras.layers.wrappers import Bidirectional -from keras.models import Model +from tensorflow.keras import backend as K +from tensorflow.keras.initializers import glorot_uniform, Orthogonal +from tensorflow.keras.layers import Input, LSTM, Lambda, Dense, Dropout, Bidirectional +from tensorflow.keras.models import Model from deeppavlov.core.common.registry import register from deeppavlov.core.layers.keras_layers import AttentiveMatchingLayer, MaxattentiveMatchingLayer diff --git a/deeppavlov/models/tokenizers/nltk_moses_tokenizer.py b/deeppavlov/models/tokenizers/nltk_moses_tokenizer.py index 64c34b7fcf..9ef22cb028 100644 --- a/deeppavlov/models/tokenizers/nltk_moses_tokenizer.py +++ b/deeppavlov/models/tokenizers/nltk_moses_tokenizer.py @@ -13,7 +13,7 @@ # limitations under the License. 
from typing import Union, List -from nltk.tokenize.moses import MosesDetokenizer, MosesTokenizer +from sacremoses import MosesDetokenizer, MosesTokenizer from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component diff --git a/deeppavlov/requirements/aiml_skill.txt b/deeppavlov/requirements/aiml_skill.txt index 082e9bea14..6a6602091e 100644 --- a/deeppavlov/requirements/aiml_skill.txt +++ b/deeppavlov/requirements/aiml_skill.txt @@ -1 +1 @@ -python-aiml==0.9.1 \ No newline at end of file +python-aiml==0.9.3 \ No newline at end of file diff --git a/deeppavlov/requirements/en_core_web_sm.txt b/deeppavlov/requirements/en_core_web_sm.txt index 3cf354675e..3fb142ab5d 100644 --- a/deeppavlov/requirements/en_core_web_sm.txt +++ b/deeppavlov/requirements/en_core_web_sm.txt @@ -1 +1 @@ -https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm==2.1.0 \ No newline at end of file +https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5 \ No newline at end of file diff --git a/deeppavlov/requirements/fasttext.txt b/deeppavlov/requirements/fasttext.txt index 896ed329ed..9b0c672164 100644 --- a/deeppavlov/requirements/fasttext.txt +++ b/deeppavlov/requirements/fasttext.txt @@ -1,2 +1 @@ -pybind11==2.2.3 -git+https://github.com/deepmipt/fastText.git#egg=fastText==0.8.22 \ No newline at end of file +fasttext==0.9.1 \ No newline at end of file diff --git a/deeppavlov/requirements/gensim.txt b/deeppavlov/requirements/gensim.txt index 89bc241aef..96d519b5bd 100644 --- a/deeppavlov/requirements/gensim.txt +++ b/deeppavlov/requirements/gensim.txt @@ -1 +1 @@ -gensim==3.7.3 \ No newline at end of file +gensim==3.8.1 \ No newline at end of file diff --git a/deeppavlov/requirements/kenlm.txt b/deeppavlov/requirements/kenlm.txt index c5f77257bf..2210ba6aa5 100644 --- a/deeppavlov/requirements/kenlm.txt +++ b/deeppavlov/requirements/kenlm.txt @@ -1 +1 @@ -git+https://github.com/kpu/kenlm.git@2ad7cb56924cd3c6811c604973f592cb5ef604eb#egg=kenlm \ No newline at end of file +git+https://github.com/kpu/kenlm.git@96d303cfb1a0c21b8f060dbad640d7ab301c019a#egg=kenlm \ No newline at end of file diff --git a/deeppavlov/requirements/spacy.txt b/deeppavlov/requirements/spacy.txt index 08ed616e91..9693ba97a9 100644 --- a/deeppavlov/requirements/spacy.txt +++ b/deeppavlov/requirements/spacy.txt @@ -1 +1 @@ -spacy==2.1.3 \ No newline at end of file +spacy==2.2.3 \ No newline at end of file diff --git a/deeppavlov/requirements/spelling.txt b/deeppavlov/requirements/spelling.txt index 45fc7e17e6..8d4f300c32 100644 --- a/deeppavlov/requirements/spelling.txt +++ b/deeppavlov/requirements/spelling.txt @@ -1,3 +1,4 @@ -lxml==4.3.4 +lxml==4.4.2 python-Levenshtein==0.12.0 -sortedcontainers==2.0.2 \ No newline at end of file +sortedcontainers==2.1.0 +sacremoses==0.0.35 \ No newline at end of file diff --git a/deeppavlov/requirements/tf-gpu.txt b/deeppavlov/requirements/tf-gpu.txt index bfafa1601a..3a4ce9cac1 100644 --- a/deeppavlov/requirements/tf-gpu.txt +++ b/deeppavlov/requirements/tf-gpu.txt @@ -1 +1 @@ -tensorflow-gpu==1.14.0 \ No newline at end of file +tensorflow-gpu==1.15.0 \ No newline at end of file diff --git a/deeppavlov/requirements/tf-hub.txt b/deeppavlov/requirements/tf-hub.txt index b9e22c0609..6c9b9fb164 100644 --- a/deeppavlov/requirements/tf-hub.txt +++ b/deeppavlov/requirements/tf-hub.txt @@ -1 +1 @@ -tensorflow-hub==0.1.1 \ No 
newline at end of file +tensorflow-hub==0.7.0 \ No newline at end of file diff --git a/deeppavlov/requirements/tf.txt b/deeppavlov/requirements/tf.txt index 4af7a4546f..504887d126 100644 --- a/deeppavlov/requirements/tf.txt +++ b/deeppavlov/requirements/tf.txt @@ -1 +1 @@ -tensorflow==1.14.0 \ No newline at end of file +tensorflow==1.15.0 \ No newline at end of file diff --git a/docs/apiref/models/classifiers.rst b/docs/apiref/models/classifiers.rst index 81bb9f88b3..f11ce63e4f 100644 --- a/docs/apiref/models/classifiers.rst +++ b/docs/apiref/models/classifiers.rst @@ -5,21 +5,9 @@ deeppavlov.models.classifiers :members: .. autoclass:: deeppavlov.models.classifiers.keras_classification_model.KerasClassificationModel + :members: .. automethod:: __call__ - .. automethod:: pad_texts - .. automethod:: train_on_batch - .. automethod:: infer_on_batch - .. automethod:: cnn_model - .. automethod:: dcnn_model - .. automethod:: cnn_model_max_and_aver_pool - .. automethod:: bilstm_model - .. automethod:: bilstm_bilstm_model - .. automethod:: bilstm_cnn_model - .. automethod:: cnn_bilstm_model - .. automethod:: bilstm_self_add_attention_model - .. automethod:: bilstm_self_mult_attention_model - .. automethod:: bigru_model .. autoclass:: deeppavlov.models.classifiers.cos_sim_classifier.CosineSimilarityClassifier :members: diff --git a/docs/conf.py b/docs/conf.py index eec32ec4e5..5b861fe5b5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -191,7 +191,7 @@ # -- Extension configuration ------------------------------------------------- autodoc_mock_imports = ['tensorflow', 'tensorflow_hub', 'fastText', 'nltk', 'gensim', 'kenlm', 'spacy', 'lxml', - 'sortedcontainers', 'russian_tagsets', 'bert_dp', 'aiml', 'rasa'] + 'sortedcontainers', 'russian_tagsets', 'bert_dp', 'aiml', 'rasa', 'fasttext', 'sacremoses'] extlinks = { 'config': (f'https://github.com/deepmipt/DeepPavlov/blob/{release}/deeppavlov/configs/%s', None) diff --git a/docs/features/models/ner.rst b/docs/features/models/ner.rst index 145e21f18c..750ea7789c 100644 --- a/docs/features/models/ner.rst +++ b/docs/features/models/ner.rst @@ -41,7 +41,7 @@ Here is the list of all available configs: +----------------------------------------------------------------------+ +----------+-----------------+------------+------------+ | :config:`ner_ontonotes_bert ` | | En | 400 MB | 800 MB | 88.6 | +----------------------------------------------------------------------+ + +-----------------+------------+------------+ - | :config:`ner_ontonotes ` | | | 331 MB | 7.8 MB | 86.4 | + | :config:`ner_ontonotes ` | | | 331 MB | 7.8 MB | 86.7 | +----------------------------------------------------------------------+--------------------+ +-----------------+------------+------------+ | :config:`ner_conll2003_bert ` | CoNLL-2003 | | 400 MB | 850 MB | **91.7** | +----------------------------------------------------------------------+ + +-----------------+------------+------------+ diff --git a/requirements.txt b/requirements.txt index 0467d71d15..e291f03eb2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,21 +1,20 @@ aio-pika==6.4.1 -Cython==0.29.12 -fastapi==0.46.0 +Cython==0.29.14 +fastapi==0.47.1 fuzzywuzzy==0.17.0 -h5py==2.9.0 -keras==2.2.4 -nltk==3.2.5 -numpy==1.16.4 -overrides==1.9 -pandas==0.24.2 +h5py==2.10.0 +nltk==3.4.5 +numpy==1.18.0 +overrides==2.7.0 +pandas==0.25.3 pydantic==1.3 pymorphy2==0.8 pymorphy2-dicts-ru -pyopenssl==19.0.0 -pytelegrambotapi==3.6.6 +pyopenssl==19.1.0 +pytelegrambotapi==3.6.7 requests==2.22.0 rusenttokenize==0.0.5 
scikit-learn==0.21.2 -scipy==1.3.0 -tqdm==4.32.2 +scipy==1.4.1 +tqdm==4.41.1 uvicorn==0.11.1 \ No newline at end of file diff --git a/tests/test_configs/ranking/paraphrase_ident_paraphraser_interact_test.json b/tests/test_configs/ranking/paraphrase_ident_paraphraser_interact_test.json deleted file mode 100644 index bec5f573c9..0000000000 --- a/tests/test_configs/ranking/paraphrase_ident_paraphraser_interact_test.json +++ /dev/null @@ -1,153 +0,0 @@ -{ - "dataset_reader": { - "class_name": "paraphraser_reader", - "data_path": "{DOWNLOADS_PATH}/paraphraser_data" - }, - "dataset_iterator": { - "class_name": "siamese_iterator", - "num_samples": 1024, - "seed": 243 - }, - "chainer": { - "in": [ - "x" - ], - "in_y": [ - "y" - ], - "pipe": [ - { - "id": "preproc", - "class_name": "siamese_preprocessor", - "use_matrix": false, - "max_sequence_length": 28, - "fit_on": [ - "x" - ], - "in": [ - "x" - ], - "out": [ - "x_proc" - ], - "sent_vocab": { - "id": "siam_sent_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/paraphraser_vocabs/sent.dict", - "load_path": "{MODELS_PATH}/paraphraser_vocabs/sent.dict" - }, - "tokenizer": { - "class_name": "nltk_tokenizer" - }, - "vocab": { - "id": "siam_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/paraphraser_vocabs/tok.dict", - "load_path": "{MODELS_PATH}/paraphraser_vocabs/tok.dict" - }, - "embedder": { - "id": "siam_embedder", - "class_name": "fasttext", - "load_path": "{DOWNLOADS_PATH}/embeddings/ft_dummy_ru_300.bin", - "dim": 300 - } - }, - { - "id": "embeddings", - "class_name": "emb_mat_assembler", - "embedder": "#siam_embedder", - "vocab": "#siam_vocab" - }, - { - "id": "model", - "class_name": "mpm_nn", - "len_vocab": "#siam_vocab.len", - "use_matrix": "#preproc.use_matrix", - "attention": true, - "max_sequence_length": "#preproc.max_sequence_length", - "emb_matrix": "#embeddings.emb_mat", - "embedding_dim": "#siam_embedder.dim", - "seed": 243, - "hidden_dim": 200, - "learning_rate": 0.001, - "triplet_loss": false, - "batch_size": 256, - "save_path": "{MODELS_PATH}/paraphraser_model/model_weights.h5", - "load_path": "{MODELS_PATH}/paraphraser_model/model_weights.h5", - "preprocess": "#preproc.__call__" - }, - { - "in": [ - "x_proc" - ], - "in_y": [ - "y" - ], - "out": [ - "y_predicted" - ], - "class_name": "siamese_predictor", - "model": "#model", - "ranking": false, - "attention": true, - "batch_size": "#model.batch_size", - "preproc_func": "#preproc.__call__" - } - ], - "out": [ - "y_predicted" - ] - }, - "train": { - "epochs": 200, - "batch_size": 256, - "pytest_max_batches": 2, - "train_metrics": [ - "f1", - "acc", - "log_loss" - ], - "metrics": [ - "f1", - "acc", - "log_loss" - ], - "validation_patience": 10, - "val_every_n_epochs": 5, - "log_every_n_batches": 12, - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" - ], - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/paraphrase_ident_paraphraser.tar.gz", - "subdir": "{MODELS_PATH}" - }, - { - "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip", - "subdir": "{DOWNLOADS_PATH}/paraphraser_data" - }, - { - "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip", - "subdir": "{DOWNLOADS_PATH}/paraphraser_data" - }, - { - 
"url": "http://files.deeppavlov.ai/embeddings/ft_dummy_ru_300.bin", - "subdir": "{DOWNLOADS_PATH}/embeddings" - } - ] - } -} \ No newline at end of file diff --git a/tests/test_configs/ranking/paraphrase_ident_paraphraser_test.json b/tests/test_configs/ranking/paraphrase_ident_paraphraser_test.json deleted file mode 100644 index c941b3d876..0000000000 --- a/tests/test_configs/ranking/paraphrase_ident_paraphraser_test.json +++ /dev/null @@ -1,140 +0,0 @@ -{ - "dataset_reader": { - "class_name": "paraphraser_reader", - "data_path": "{DOWNLOADS_PATH}/paraphraser_data" - }, - "dataset_iterator": { - "class_name": "siamese_iterator", - "num_samples": 1024, - "seed": 243 - }, - "chainer": { - "in": [ - "x" - ], - "in_y": [ - "y" - ], - "pipe": [ - { - "id": "preproc", - "class_name": "siamese_preprocessor", - "use_matrix": false, - "max_sequence_length": 28, - "fit_on": [ - "x" - ], - "in": [ - "x" - ], - "out": [ - "x_proc" - ], - "sent_vocab": { - "id": "siam_sent_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/paraphraser_vocabs/sent.dict", - "load_path": "{MODELS_PATH}/paraphraser_vocabs/sent.dict" - }, - "tokenizer": { - "class_name": "nltk_tokenizer" - }, - "vocab": { - "id": "siam_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/paraphraser_vocabs/tok.dict", - "load_path": "{MODELS_PATH}/paraphraser_vocabs/tok.dict" - }, - "embedder": { - "id": "siam_embedder", - "class_name": "fasttext", - "load_path": "{DOWNLOADS_PATH}/embeddings/ft_dummy_ru_300.bin", - "dim": 300 - } - }, - { - "id": "embeddings", - "class_name": "emb_mat_assembler", - "embedder": "#siam_embedder", - "vocab": "#siam_vocab" - }, - { - "in": [ - "x_proc" - ], - "in_y": [ - "y" - ], - "out": [ - "y_predicted" - ], - "class_name": "mpm_nn", - "len_vocab": "#siam_vocab.len", - "use_matrix": "#preproc.use_matrix", - "attention": true, - "max_sequence_length": "#preproc.max_sequence_length", - "emb_matrix": "#embeddings.emb_mat", - "embedding_dim": "#siam_embedder.dim", - "seed": 243, - "hidden_dim": 200, - "learning_rate": 0.001, - "triplet_loss": false, - "batch_size": 256, - "save_path": "{MODELS_PATH}/paraphraser_model/model_weights.h5", - "load_path": "{MODELS_PATH}/paraphraser_model/model_weights.h5", - "preprocess": "#preproc.__call__" - } - ], - "out": [ - "y_predicted" - ] - }, - "train": { - "epochs": 200, - "batch_size": 256, - "pytest_max_batches": 2, - "train_metrics": [ - "f1", - "acc", - "log_loss" - ], - "metrics": [ - "f1", - "acc", - "log_loss" - ], - "validation_patience": 10, - "val_every_n_epochs": 5, - "log_every_n_batches": 12, - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" - ], - "download": [ - { - "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip", - "subdir": "{DOWNLOADS_PATH}/paraphraser_data" - }, - { - "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip", - "subdir": "{DOWNLOADS_PATH}/paraphraser_data" - }, - { - "url": "http://files.deeppavlov.ai/embeddings/ft_dummy_ru_300.bin", - "subdir": "{DOWNLOADS_PATH}/embeddings" - } - ] - } -} \ No newline at end of file diff --git a/tests/test_configs/ranking/paraphrase_ident_qqp_bilstm_interact_test.json b/tests/test_configs/ranking/paraphrase_ident_qqp_bilstm_interact_test.json deleted file 
mode 100644 index 54f1a78400..0000000000 --- a/tests/test_configs/ranking/paraphrase_ident_qqp_bilstm_interact_test.json +++ /dev/null @@ -1,149 +0,0 @@ -{ - "dataset_reader": { - "class_name": "qqp_reader", - "data_path": "{DOWNLOADS_PATH}/qqp_data" - }, - "dataset_iterator": { - "class_name": "siamese_iterator", - "num_samples": 1024, - "seed": 243 - }, - "chainer": { - "in": [ - "x" - ], - "in_y": [ - "y" - ], - "pipe": [ - { - "id": "preproc", - "class_name": "siamese_preprocessor", - "use_matrix": false, - "max_sequence_length": 28, - "fit_on": [ - "x" - ], - "in": [ - "x" - ], - "out": [ - "x_proc" - ], - "sent_vocab": { - "id": "siam_sent_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/qqp_vocabs/sent.dict", - "load_path": "{MODELS_PATH}/qqp_vocabs/sent.dict" - }, - "tokenizer": { - "class_name": "nltk_tokenizer" - }, - "vocab": { - "id": "siam_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/qqp_vocabs/tok.dict", - "load_path": "{MODELS_PATH}/qqp_vocabs/tok.dict" - }, - "embedder": { - "id": "siam_embedder", - "class_name": "fasttext", - "load_path": "{DOWNLOADS_PATH}/embeddings/ft_dummy_300.bin", - "dim": 300 - } - }, - { - "id": "embeddings", - "class_name": "emb_mat_assembler", - "embedder": "#siam_embedder", - "vocab": "#siam_vocab" - }, - { - "id": "model", - "class_name": "bilstm_nn", - "len_vocab": "#siam_vocab.len", - "use_matrix": "#preproc.use_matrix", - "attention": true, - "max_sequence_length": "#preproc.max_sequence_length", - "emb_matrix": "#embeddings.emb_mat", - "embedding_dim": "#siam_embedder.dim", - "seed": 243, - "hidden_dim": 200, - "learning_rate": 0.001, - "triplet_loss": false, - "batch_size": 256, - "save_path": "{MODELS_PATH}/qqp_model/model_weights.h5", - "load_path": "{MODELS_PATH}/qqp_model/model_weights.h5", - "preprocess": "#preproc.__call__" - }, - { - "in": [ - "x_proc" - ], - "in_y": [ - "y" - ], - "out": [ - "y_predicted" - ], - "class_name": "siamese_predictor", - "model": "#model", - "ranking": false, - "attention": true, - "batch_size": "#model.batch_size", - "preproc_func": "#preproc.__call__" - } - ], - "out": [ - "y_predicted" - ] - }, - "train": { - "epochs": 200, - "batch_size": 256, - "pytest_max_batches": 2, - "train_metrics": [ - "log_loss", - "acc", - "f1" - ], - "metrics": [ - "log_loss", - "acc", - "f1" - ], - "validation_patience": 10, - "val_every_n_epochs": 1, - "log_every_n_batches": 525, - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" - ], - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/paraphrase_ident_qqp_bilstm.tar.gz", - "subdir": "{MODELS_PATH}" - }, - { - "url": "http://files.deeppavlov.ai/datasets/quora_question_pairs.zip", - "subdir": "{DOWNLOADS_PATH}/qqp_data" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/ft_dummy_300.bin", - "subdir": "{DOWNLOADS_PATH}/embeddings" - } - ] - } -} \ No newline at end of file diff --git a/tests/test_configs/ranking/paraphrase_ident_qqp_bilstm_test.json b/tests/test_configs/ranking/paraphrase_ident_qqp_bilstm_test.json deleted file mode 100644 index 78d4f1be54..0000000000 --- a/tests/test_configs/ranking/paraphrase_ident_qqp_bilstm_test.json +++ /dev/null @@ -1,137 +0,0 @@ -{ - "dataset_reader": { - 
"class_name": "qqp_reader", - "data_path": "{DOWNLOADS_PATH}/qqp_data" - }, - "dataset_iterator": { - "class_name": "siamese_iterator", - "num_samples": 1024, - "seed": 243 - }, - "chainer": { - "in": [ - "x" - ], - "in_y": [ - "y" - ], - "pipe": [ - { - "id": "preproc", - "class_name": "siamese_preprocessor", - "use_matrix": false, - "max_sequence_length": 28, - "fit_on": [ - "x" - ], - "in": [ - "x" - ], - "out": [ - "x_proc" - ], - "sent_vocab": { - "id": "siam_sent_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/qqp_vocabs_bilstm/sent.dict", - "load_path": "{MODELS_PATH}/qqp_vocabs_bilstm/sent.dict" - }, - "tokenizer": { - "class_name": "nltk_tokenizer" - }, - "vocab": { - "id": "siam_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/qqp_vocabs_bilstm/tok.dict", - "load_path": "{MODELS_PATH}/qqp_vocabs_bilstm/tok.dict" - }, - "embedder": { - "id": "siam_embedder", - "class_name": "fasttext", - "load_path": "{DOWNLOADS_PATH}/embeddings/ft_dummy_300.bin", - "dim": 300 - } - }, - { - "id": "embeddings", - "class_name": "emb_mat_assembler", - "embedder": "#siam_embedder", - "vocab": "#siam_vocab" - }, - { - "in": [ - "x_proc" - ], - "in_y": [ - "y" - ], - "out": [ - "y_predicted" - ], - "class_name": "bilstm_nn", - "len_vocab": "#siam_vocab.len", - "use_matrix": "#preproc.use_matrix", - "attention": true, - "max_sequence_length": "#preproc.max_sequence_length", - "emb_matrix": "#embeddings.emb_mat", - "embedding_dim": "#siam_embedder.dim", - "seed": 243, - "hidden_dim": 200, - "learning_rate": 0.001, - "triplet_loss": false, - "batch_size": 256, - "save_path": "{MODELS_PATH}/qqp_model_bilstm/model_weights.h5", - "load_path": "{MODELS_PATH}/qqp_model_bilstm/model_weights.h5", - "preprocess": "#preproc.__call__" - } - ], - "out": [ - "y_predicted" - ] - }, - "train": { - "epochs": 200, - "batch_size": 256, - "pytest_max_batches": 2, - "metric_optimization": "minimize", - "train_metrics": [ - "log_loss", - "acc", - "f1" - ], - "metrics": [ - "log_loss", - "acc", - "f1" - ], - "validation_patience": 10, - "val_every_n_epochs": 1, - "log_every_n_batches": 525, - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" - ], - "download": [ - { - "url": "http://files.deeppavlov.ai/datasets/quora_question_pairs.zip", - "subdir": "{DOWNLOADS_PATH}/qqp_data" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/ft_dummy_300.bin", - "subdir": "{DOWNLOADS_PATH}/embeddings" - } - ] - } -} \ No newline at end of file diff --git a/tests/test_configs/ranking/paraphrase_ident_qqp_interact_test.json b/tests/test_configs/ranking/paraphrase_ident_qqp_interact_test.json deleted file mode 100644 index c995617f29..0000000000 --- a/tests/test_configs/ranking/paraphrase_ident_qqp_interact_test.json +++ /dev/null @@ -1,149 +0,0 @@ -{ - "dataset_reader": { - "class_name": "qqp_reader", - "data_path": "{DOWNLOADS_PATH}/qqp_data" - }, - "dataset_iterator": { - "class_name": "siamese_iterator", - "num_samples": 1024, - "seed": 243 - }, - "chainer": { - "in": [ - "x" - ], - "in_y": [ - "y" - ], - "pipe": [ - { - "id": "preproc", - "class_name": "siamese_preprocessor", - "use_matrix": false, - "max_sequence_length": 28, - "fit_on": [ - "x" - ], - "in": [ - "x" - ], - "out": [ 
- "x_proc" - ], - "sent_vocab": { - "id": "siam_sent_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/qqp_vocabs/sent.dict", - "load_path": "{MODELS_PATH}/qqp_vocabs/sent.dict" - }, - "tokenizer": { - "class_name": "nltk_tokenizer" - }, - "vocab": { - "id": "siam_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/qqp_vocabs/tok.dict", - "load_path": "{MODELS_PATH}/qqp_vocabs/tok.dict" - }, - "embedder": { - "id": "siam_embedder", - "class_name": "fasttext", - "load_path": "{DOWNLOADS_PATH}/embeddings/ft_dummy_300.bin", - "dim": 300 - } - }, - { - "id": "embeddings", - "class_name": "emb_mat_assembler", - "embedder": "#siam_embedder", - "vocab": "#siam_vocab" - }, - { - "id": "model", - "class_name": "mpm_nn", - "len_vocab": "#siam_vocab.len", - "use_matrix": "#preproc.use_matrix", - "attention": true, - "max_sequence_length": "#preproc.max_sequence_length", - "emb_matrix": "#embeddings.emb_mat", - "embedding_dim": "#siam_embedder.dim", - "seed": 243, - "hidden_dim": 200, - "learning_rate": 0.001, - "triplet_loss": false, - "batch_size": 256, - "save_path": "{MODELS_PATH}/qqp_model/model_weights.h5", - "load_path": "{MODELS_PATH}/qqp_model/model_weights.h5", - "preprocess": "#preproc.__call__" - }, - { - "in": [ - "x_proc" - ], - "in_y": [ - "y" - ], - "out": [ - "y_predicted" - ], - "class_name": "siamese_predictor", - "model": "#model", - "ranking": false, - "attention": true, - "batch_size": "#model.batch_size", - "preproc_func": "#preproc.__call__" - } - ], - "out": [ - "y_predicted" - ] - }, - "train": { - "epochs": 200, - "batch_size": 256, - "pytest_max_batches": 2, - "train_metrics": [ - "log_loss", - "acc", - "f1" - ], - "metrics": [ - "log_loss", - "acc", - "f1" - ], - "validation_patience": 10, - "val_every_n_epochs": 1, - "log_every_n_batches": 525, - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" - ], - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/paraphrase_ident_qqp.tar.gz", - "subdir": "{MODELS_PATH}" - }, - { - "url": "http://files.deeppavlov.ai/datasets/quora_question_pairs.zip", - "subdir": "{DOWNLOADS_PATH}/qqp_data" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/ft_dummy_300.bin", - "subdir": "{DOWNLOADS_PATH}/embeddings" - } - ] - } -} \ No newline at end of file diff --git a/tests/test_configs/ranking/paraphrase_ident_qqp_test.json b/tests/test_configs/ranking/paraphrase_ident_qqp_test.json deleted file mode 100644 index e274cc2012..0000000000 --- a/tests/test_configs/ranking/paraphrase_ident_qqp_test.json +++ /dev/null @@ -1,137 +0,0 @@ -{ - "dataset_reader": { - "class_name": "qqp_reader", - "data_path": "{DOWNLOADS_PATH}/qqp_data" - }, - "dataset_iterator": { - "class_name": "siamese_iterator", - "num_samples": 1024, - "seed": 243 - }, - "chainer": { - "in": [ - "x" - ], - "in_y": [ - "y" - ], - "pipe": [ - { - "id": "preproc", - "class_name": "siamese_preprocessor", - "use_matrix": false, - "max_sequence_length": 28, - "fit_on": [ - "x" - ], - "in": [ - "x" - ], - "out": [ - "x_proc" - ], - "sent_vocab": { - "id": "siam_sent_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/qqp_vocabs/sent.dict", - "load_path": "{MODELS_PATH}/qqp_vocabs/sent.dict" - }, - 
"tokenizer": { - "class_name": "nltk_tokenizer" - }, - "vocab": { - "id": "siam_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/qqp_vocabs/tok.dict", - "load_path": "{MODELS_PATH}/qqp_vocabs/tok.dict" - }, - "embedder": { - "id": "siam_embedder", - "class_name": "fasttext", - "load_path": "{DOWNLOADS_PATH}/embeddings/ft_dummy_300.bin", - "dim": 300 - } - }, - { - "id": "embeddings", - "class_name": "emb_mat_assembler", - "embedder": "#siam_embedder", - "vocab": "#siam_vocab" - }, - { - "in": [ - "x_proc" - ], - "in_y": [ - "y" - ], - "out": [ - "y_predicted" - ], - "class_name": "mpm_nn", - "len_vocab": "#siam_vocab.len", - "use_matrix": "#preproc.use_matrix", - "attention": true, - "max_sequence_length": "#preproc.max_sequence_length", - "emb_matrix": "#embeddings.emb_mat", - "embedding_dim": "#siam_embedder.dim", - "seed": 243, - "hidden_dim": 200, - "learning_rate": 0.001, - "triplet_loss": false, - "batch_size": 256, - "save_path": "{MODELS_PATH}/qqp_model/model_weights.h5", - "load_path": "{MODELS_PATH}/qqp_model/model_weights.h5", - "preprocess": "#preproc.__call__" - } - ], - "out": [ - "y_predicted" - ] - }, - "train": { - "epochs": 200, - "batch_size": 256, - "pytest_max_batches": 2, - "metric_optimization": "minimize", - "train_metrics": [ - "log_loss", - "acc", - "f1" - ], - "metrics": [ - "log_loss", - "acc", - "f1" - ], - "validation_patience": 10, - "val_every_n_epochs": 1, - "log_every_n_batches": 525, - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" - ], - "download": [ - { - "url": "http://files.deeppavlov.ai/datasets/quora_question_pairs.zip", - "subdir": "{DOWNLOADS_PATH}/qqp_data" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/ft_dummy_300.bin", - "subdir": "{DOWNLOADS_PATH}/embeddings" - } - ] - } -} \ No newline at end of file diff --git a/tests/test_configs/ranking/ranking_insurance_interact_test.json b/tests/test_configs/ranking/ranking_insurance_interact_test.json deleted file mode 100644 index 0b2515120d..0000000000 --- a/tests/test_configs/ranking/ranking_insurance_interact_test.json +++ /dev/null @@ -1,152 +0,0 @@ -{ - "dataset_reader": { - "class_name": "insurance_reader", - "data_path": "{DOWNLOADS_PATH}/insurance_data" - }, - "dataset_iterator": { - "class_name": "siamese_iterator", - "random_batches": true, - "batches_per_epoch": 72, - "num_samples": 1024, - "seed": 243 - }, - "chainer": { - "in": [ - "x" - ], - "in_y": [ - "y" - ], - "pipe": [ - { - "id": "preproc", - "class_name": "siamese_preprocessor", - "use_matrix": false, - "num_ranking_samples": 2, - "max_sequence_length": 200, - "fit_on": [ - "x" - ], - "in": [ - "x" - ], - "out": [ - "x_proc" - ], - "sent_vocab": { - "id": "siam_sent_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/insurance_vocabs/sent.dict", - "load_path": "{MODELS_PATH}/insurance_vocabs/sent.dict" - }, - "tokenizer": { - "class_name": "split_tokenizer" - }, - "vocab": { - "id": "siam_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/insurance_vocabs/tok.dict", - "load_path": "{MODELS_PATH}/insurance_vocabs/tok.dict" - }, - "embedder": { - "id": "siam_embedder", - "class_name": "fasttext", - "load_path": 
"{DOWNLOADS_PATH}/embeddings/ft_dummy_300.bin", - "dim": 300 - } - }, - { - "id": "embeddings", - "class_name": "emb_mat_assembler", - "embedder": "#siam_embedder", - "vocab": "#siam_vocab" - }, - { - "id": "model", - "class_name": "bilstm_nn", - "len_vocab": "#siam_vocab.len", - "use_matrix": "#preproc.use_matrix", - "max_sequence_length": "#preproc.max_sequence_length", - "emb_matrix": "#embeddings.emb_mat", - "embedding_dim": "#siam_embedder.dim", - "seed": 243, - "reccurent": "bilstm", - "max_pooling": true, - "shared_weights": true, - "hidden_dim": 300, - "learning_rate": 0.001, - "triplet_loss": true, - "hard_triplets": false, - "margin": 0.1, - "batch_size": 256, - "save_path": "{MODELS_PATH}/insurance_model/model_weights.h5", - "load_path": "{MODELS_PATH}/insurance_model/model_weights.h5", - "preprocess": "#preproc.__call__", - "interact_pred_num": 3 - }, - { - "in": [ - "x_proc" - ], - "in_y": [ - "y" - ], - "out": [ - "y_predicted" - ], - "class_name": "siamese_predictor", - "model": "#model", - "batch_size": "#model.batch_size", - "interact_pred_num": 3, - "responses": "#siam_sent_vocab", - "preproc_func": "#preproc.__call__" - } - ], - "out": [ - "y_predicted" - ] - }, - "train": { - "epochs": 200, - "batch_size": 256, - "pytest_max_batches": 2, - "train_metrics": [], - "metrics": [ - "r@1_insQA", - "rank_response" - ], - "validation_patience": 5, - "val_every_n_epochs": 5, - "log_every_n_batches": 24, - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" - ], - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/insurance_ranking.tar.gz", - "subdir": "{MODELS_PATH}" - }, - { - "url": "http://files.deeppavlov.ai/datasets/insuranceQA-master.zip", - "subdir": "{DOWNLOADS_PATH}/insurance_data" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/ft_dummy_300.bin", - "subdir": "{DOWNLOADS_PATH}/embeddings" - } - ] - } -} \ No newline at end of file diff --git a/tests/test_configs/ranking/ranking_insurance_test.json b/tests/test_configs/ranking/ranking_insurance_test.json deleted file mode 100644 index c325922ca7..0000000000 --- a/tests/test_configs/ranking/ranking_insurance_test.json +++ /dev/null @@ -1,139 +0,0 @@ -{ - "dataset_reader": { - "class_name": "insurance_reader", - "data_path": "{DOWNLOADS_PATH}/insurance_data" - }, - "dataset_iterator": { - "class_name": "siamese_iterator", - "random_batches": true, - "batches_per_epoch": 72, - "num_samples": 1024, - "seed": 243 - }, - "chainer": { - "in": [ - "x" - ], - "in_y": [ - "y" - ], - "pipe": [ - { - "id": "preproc", - "class_name": "siamese_preprocessor", - "use_matrix": false, - "num_ranking_samples": 2, - "max_sequence_length": 200, - "fit_on": [ - "x" - ], - "in": [ - "x" - ], - "out": [ - "x_proc" - ], - "sent_vocab": { - "id": "siam_sent_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/insurance_vocabs/sent.dict", - "load_path": "{MODELS_PATH}/insurance_vocabs/sent.dict" - }, - "tokenizer": { - "class_name": "split_tokenizer" - }, - "vocab": { - "id": "siam_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/insurance_vocabs/tok.dict", - "load_path": "{MODELS_PATH}/insurance_vocabs/tok.dict" - }, - "embedder": { - "id": "siam_embedder", - "class_name": 
"fasttext", - "load_path": "{DOWNLOADS_PATH}/embeddings/ft_dummy_300.bin", - "dim": 300 - } - }, - { - "id": "embeddings", - "class_name": "emb_mat_assembler", - "embedder": "#siam_embedder", - "vocab": "#siam_vocab" - }, - { - "in": [ - "x_proc" - ], - "in_y": [ - "y" - ], - "out": [ - "y_predicted" - ], - "class_name": "bilstm_nn", - "len_vocab": "#siam_vocab.len", - "use_matrix": "#preproc.use_matrix", - "max_sequence_length": "#preproc.max_sequence_length", - "emb_matrix": "#embeddings.emb_mat", - "embedding_dim": "#siam_embedder.dim", - "seed": 243, - "reccurent": "bilstm", - "max_pooling": true, - "shared_weights": true, - "hidden_dim": 300, - "learning_rate": 0.001, - "triplet_loss": true, - "hard_triplets": false, - "margin": 0.1, - "batch_size": 256, - "save_path": "{MODELS_PATH}/insurance_model/model_weights.h5", - "load_path": "{MODELS_PATH}/insurance_model/model_weights.h5", - "preprocess": "#preproc.__call__", - "interact_pred_num": 3 - } - ], - "out": [ - "y_predicted" - ] - }, - "train": { - "epochs": 200, - "batch_size": 256, - "pytest_max_batches": 2, - "train_metrics": [], - "metrics": [ - "r@1_insQA", - "rank_response" - ], - "validation_patience": 5, - "val_every_n_epochs": 5, - "log_every_n_batches": 24, - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" - ], - "download": [ - { - "url": "http://files.deeppavlov.ai/datasets/insuranceQA-master.zip", - "subdir": "{DOWNLOADS_PATH}/insurance_data" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/ft_dummy_300.bin", - "subdir": "{DOWNLOADS_PATH}/embeddings" - } - ] - } -} \ No newline at end of file diff --git a/tests/test_configs/ranking/ranking_ubuntu_v2_bert_sep_interact_test.json b/tests/test_configs/ranking/ranking_ubuntu_v2_bert_sep_interact_test.json deleted file mode 100644 index 78a33bcb88..0000000000 --- a/tests/test_configs/ranking/ranking_ubuntu_v2_bert_sep_interact_test.json +++ /dev/null @@ -1,95 +0,0 @@ -{ - "dataset_reader": { - "class_name": "ubuntu_v2_reader", - "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_small_data", - "positive_samples": true - }, - "dataset_iterator": { - "class_name": "siamese_iterator", - "seed": 243 - }, - "chainer": { - "in": ["x"], - "in_y": ["y"], - "pipe": [ - { - "class_name": "response_base_loader", - "id": "loader", - "save_path": "{MODEL_PATH}", - "load_path": "{MODEL_PATH}" - }, - { - "class_name": "bert_sep_ranker_predictor_preprocessor", - "id": "preproc", - "vocab_file": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/vocab.txt", - "do_lower_case": true, - "max_seq_length": 128, - "resps": "#loader.resps", - "resp_vecs": "#loader.resp_vecs", - "conts": "#loader.conts", - "cont_vecs": "#loader.cont_vecs", - "in": ["x"], - "out": ["bert_features"] - }, - { - "class_name": "bert_sep_ranker_predictor", - "resps": "#loader.resps", - "resp_vecs": "#loader.resp_vecs", - "resp_features": "#preproc.resp_features", - "conts": "#loader.conts", - "cont_vecs": "#loader.cont_vecs", - "cont_features": "#preproc.cont_features", - "interact_mode": 3, - "bert_config_file": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/bert_config.json", - "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/bert_model.ckpt", - "save_path": "{MODEL_PATH}", - 
"load_path": "{MODEL_PATH}/model", - "learning_rate": 2e-05, - "in": ["bert_features"], - "in_y": ["y"], - "out": ["predictions"] - } - ], - "out": ["predictions"] - }, - "train": { - "batch_size": 16, - "pytest_max_batches": 2, - "train_metrics": [], - "metrics": ["r@1", "r@2", "r@5"], - "validation_patience": 1, - "val_every_n_batches": -1, - "val_every_n_epochs": 1, - "log_every_n_batches": -1, - "validate_best": true, - "test_best": true, - "tensorboard_log_dir": "{MODEL_PATH}/" - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models", - "MODEL_PATH": "{MODELS_PATH}/ubuntu_v2_uncased_bert_sep_predictor_model" - }, - "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/bert_dp.txt" - ], - "download": [ - { - "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_small_data.tar.gz", - "subdir": "{DOWNLOADS_PATH}" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/uncased_L-12_H-768_A-12.zip", - "subdir": "{DOWNLOADS_PATH}/bert_models" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/ubuntu_v2_uncased_bert_sep_predictor_model.tar.gz", - "subdir": "{MODELS_PATH}" - } - - ] - } -} \ No newline at end of file diff --git a/tests/test_configs/ranking/ranking_ubuntu_v2_bert_sep_test.json b/tests/test_configs/ranking/ranking_ubuntu_v2_bert_sep_test.json deleted file mode 100644 index bf8d1ff6a8..0000000000 --- a/tests/test_configs/ranking/ranking_ubuntu_v2_bert_sep_test.json +++ /dev/null @@ -1,76 +0,0 @@ -{ - "dataset_reader": { - "class_name": "ubuntu_v2_reader", - "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_small_data", - "positive_samples": true - }, - "dataset_iterator": { - "class_name": "siamese_iterator", - "seed": 243 - }, - "chainer": { - "in": ["x"], - "in_y": ["y"], - "pipe": [ - { - "class_name": "bert_sep_ranker_preprocessor", - "vocab_file": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/vocab.txt", - "do_lower_case": true, - "max_seq_length": 128, - "in": ["x"], - "out": ["bert_features"] - }, - { - "class_name": "bert_sep_ranker", - "bert_config_file": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/bert_config.json", - "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/bert_model.ckpt", - "save_path": "{MODEL_PATH}/model", - "load_path": "{MODEL_PATH}/model", - "learning_rate": 2e-05, - "in": ["bert_features"], - "in_y": ["y"], - "out": ["predictions"] - } - ], - "out": ["predictions"] - }, - "train": { - "batch_size": 16, - "pytest_max_batches": 2, - "train_metrics": [], - "metrics": ["r@1", "r@2", "r@5"], - "validation_patience": 1, - "val_every_n_batches": -1, - "val_every_n_epochs": 1, - "log_every_n_batches": -1, - "validate_best": true, - "test_best": true, - "tensorboard_log_dir": "{MODEL_PATH}/" - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models", - "MODEL_PATH": "{MODELS_PATH}/ubuntu_v2_uncased_bert_sep_model" - }, - "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/bert_dp.txt" - ], - "download": [ - { - "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_small_data.tar.gz", - "subdir": "{DOWNLOADS_PATH}" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/uncased_L-12_H-768_A-12.zip", - "subdir": "{DOWNLOADS_PATH}/bert_models" - }, - { - "url": 
"http://files.deeppavlov.ai/deeppavlov_data/ubuntu_v2_uncased_bert_sep_model.tar.gz", - "subdir": "{MODELS_PATH}" - } - ] - } -} \ No newline at end of file diff --git a/tests/test_configs/ranking/ranking_ubuntu_v2_bert_uncased_test.json b/tests/test_configs/ranking/ranking_ubuntu_v2_bert_uncased_test.json deleted file mode 100644 index c55ff8cb9f..0000000000 --- a/tests/test_configs/ranking/ranking_ubuntu_v2_bert_uncased_test.json +++ /dev/null @@ -1,76 +0,0 @@ -{ - "dataset_reader": { - "class_name": "ubuntu_v2_reader", - "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_small_data" - }, - "dataset_iterator": { - "class_name": "siamese_iterator", - "seed": 243 - }, - "chainer": { - "in": ["x"], - "in_y": ["y"], - "pipe": [ - { - "class_name": "bert_ranker_preprocessor", - "vocab_file": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/vocab.txt", - "do_lower_case": true, - "max_seq_length": 128, - "in": ["x"], - "out": ["bert_features"] - }, - { - "class_name": "bert_ranker", - "one_hot_labels": false, - "bert_config_file": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/bert_config.json", - "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/uncased_L-12_H-768_A-12/bert_model.ckpt", - "save_path": "{MODEL_PATH}/model", - "load_path": "{MODEL_PATH}/model", - "learning_rate": 2e-05, - "in": ["bert_features"], - "in_y": ["y"], - "out": ["predictions"] - } - ], - "out": ["predictions"] - }, - "train": { - "batch_size": 32, - "pytest_max_batches": 2, - "train_metrics": [], - "metrics": ["r@1", "r@2", "r@5"], - "validation_patience": 1, - "val_every_n_batches": -1, - "val_every_n_epochs": 1, - "log_every_n_batches": -1, - "validate_best": true, - "test_best": true, - "tensorboard_log_dir": "{MODEL_PATH}/" - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models", - "MODEL_PATH": "{MODELS_PATH}/ubuntu_v2_uncased_bert_model" - }, - "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/bert_dp.txt" - ], - "download": [ - { - "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_small_data.tar.gz", - "subdir": "{DOWNLOADS_PATH}" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/uncased_L-12_H-768_A-12.zip", - "subdir": "{DOWNLOADS_PATH}/bert_models" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/ubuntu_v2_uncased_bert_model.tar.gz", - "subdir": "{MODELS_PATH}" - } - ] - } -} \ No newline at end of file diff --git a/tests/test_configs/ranking/ranking_ubuntu_v2_mt_interact_test.json b/tests/test_configs/ranking/ranking_ubuntu_v2_mt_interact_test.json deleted file mode 100644 index fd75943707..0000000000 --- a/tests/test_configs/ranking/ranking_ubuntu_v2_mt_interact_test.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "dataset_reader": { - "class_name": "ubuntu_v2_mt_reader", - "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_data", - "num_context_turns": 10 - }, - "dataset_iterator": { - "class_name": "siamese_iterator", - "num_samples": 1024, - "seed": 243 - }, - "chainer": { - "in": [ - "x" - ], - "in_y": [ - "y" - ], - "pipe": [ - { - "id": "preproc", - "class_name": "siamese_preprocessor", - "use_matrix": true, - "num_ranking_samples": 2, - "num_context_turns": 10, - "max_sequence_length": 50, - "fit_on": [ - "x" - ], - "in": [ - "x" - ], - "out": [ - "x_proc" - ], - "sent_vocab": { - "id": "siam_sent_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/ubuntu_v2_vocabs/sent.dict", - "load_path": 
"{MODELS_PATH}/ubuntu_v2_vocabs/sent.dict" - }, - "tokenizer": { - "class_name": "split_tokenizer" - }, - "vocab": { - "id": "siam_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/ubuntu_v2_mt_vocabs/tok.dict", - "load_path": "{MODELS_PATH}/ubuntu_v2_mt_vocabs/tok.dict" - }, - "embedder": { - "id": "siam_embedder", - "class_name": "fasttext", - "load_path": "{DOWNLOADS_PATH}/embeddings/ft_dummy_300.bin", - "dim": 300 - } - }, - { - "id": "embeddings", - "class_name": "emb_mat_assembler", - "embedder": "#siam_embedder", - "vocab": "#siam_vocab" - }, - { - "id": "model", - "class_name": "bilstm_gru_nn", - "use_matrix": "#preproc.use_matrix", - "num_context_turns": "#preproc.num_context_turns", - "len_vocab": "#siam_vocab.len", - "max_sequence_length": "#preproc.max_sequence_length", - "embedding_dim": "#siam_embedder.dim", - "emb_matrix": "#embeddings.emb_mat", - "seed": 243, - "hidden_dim": 300, - "learning_rate": 0.001, - "triplet_loss": false, - "batch_size": 256, - "save_path": "{MODELS_PATH}/ubuntu_v2_mt_model/model_weights.h5", - "load_path": "{MODELS_PATH}/ubuntu_v2_mt_model/model_weights.h5" - }, - { - "in": [ - "x_proc" - ], - "in_y": [ - "y" - ], - "out": [ - "y_predicted" - ], - "class_name": "siamese_predictor", - "model": "#model", - "num_context_turns": "#model.num_context_turns", - "batch_size": "#model.batch_size", - "interact_pred_num": 3, - "attention": true, - "responses": "#siam_sent_vocab", - "preproc_func": "#preproc.__call__" - } - ], - "out": [ - "y_predicted" - ] - }, - "train": { - "epochs": 200, - "batch_size": 256, - "pytest_max_batches": 2, - "train_metrics": [], - "metrics": [ - "r@1", - "rank_response" - ], - "validation_patience": 10, - "val_every_n_epochs": 1, - "log_every_n_batches": 1000, - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" - ], - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/ubuntu_v2_mt_ranking.tar.gz", - "subdir": "{MODELS_PATH}" - }, - { - "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_data.tar.gz", - "subdir": "{DOWNLOADS_PATH}/ubuntu_v2_data" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/ft_dummy_300.bin", - "subdir": "{DOWNLOADS_PATH}/embeddings" - } - ] - } -} \ No newline at end of file diff --git a/tests/test_configs/ranking/ranking_ubuntu_v2_mt_test.json b/tests/test_configs/ranking/ranking_ubuntu_v2_mt_test.json deleted file mode 100644 index d97841acbe..0000000000 --- a/tests/test_configs/ranking/ranking_ubuntu_v2_mt_test.json +++ /dev/null @@ -1,133 +0,0 @@ -{ - "dataset_reader": { - "class_name": "ubuntu_v2_mt_reader", - "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_data", - "num_context_turns": 10 - }, - "dataset_iterator": { - "class_name": "siamese_iterator", - "num_samples": 1024, - "seed": 243 - }, - "chainer": { - "in": [ - "x" - ], - "in_y": [ - "y" - ], - "pipe": [ - { - "id": "preproc", - "class_name": "siamese_preprocessor", - "use_matrix": true, - "num_ranking_samples": 2, - "num_context_turns": 10, - "max_sequence_length": 50, - "fit_on": [ - "x" - ], - "in": [ - "x" - ], - "out": [ - "x_proc" - ], - "sent_vocab": { - "id": "siam_sent_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/ubuntu_v2_vocabs/sent.dict", - "load_path": 
"{MODELS_PATH}/ubuntu_v2_vocabs/sent.dict" - }, - "tokenizer": { - "class_name": "split_tokenizer" - }, - "vocab": { - "id": "siam_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/ubuntu_v2_mt_vocabs/tok.dict", - "load_path": "{MODELS_PATH}/ubuntu_v2_mt_vocabs/tok.dict" - }, - "embedder": { - "id": "siam_embedder", - "class_name": "fasttext", - "load_path": "{DOWNLOADS_PATH}/embeddings/ft_dummy_300.bin", - "dim": 300 - } - }, - { - "id": "embeddings", - "class_name": "emb_mat_assembler", - "embedder": "#siam_embedder", - "vocab": "#siam_vocab" - }, - { - "in": [ - "x_proc" - ], - "in_y": [ - "y" - ], - "out": [ - "y_predicted" - ], - "class_name": "bilstm_gru_nn", - "use_matrix": "#preproc.use_matrix", - "num_context_turns": "#preproc.num_context_turns", - "len_vocab": "#siam_vocab.len", - "max_sequence_length": "#preproc.max_sequence_length", - "embedding_dim": "#siam_embedder.dim", - "emb_matrix": "#embeddings.emb_mat", - "seed": 243, - "hidden_dim": 300, - "learning_rate": 0.001, - "triplet_loss": false, - "batch_size": 256, - "save_path": "{MODELS_PATH}/ubuntu_v2_mt_model/model_weights.h5", - "load_path": "{MODELS_PATH}/ubuntu_v2_mt_model/model_weights.h5" - } - ], - "out": [ - "y_predicted" - ] - }, - "train": { - "epochs": 200, - "batch_size": 256, - "pytest_max_batches": 2, - "train_metrics": [], - "metrics": [ - "r@1", - "rank_response" - ], - "validation_patience": 10, - "val_every_n_epochs": 1, - "log_every_n_batches": 1000, - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" - ], - "download": [ - { - "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_data.tar.gz", - "subdir": "{DOWNLOADS_PATH}/ubuntu_v2_data" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/ft_dummy_300.bin", - "subdir": "{DOWNLOADS_PATH}/embeddings" - } - ] - } -} \ No newline at end of file diff --git a/tests/test_configs/ranking/ranking_ubuntu_v2_test.json b/tests/test_configs/ranking/ranking_ubuntu_v2_test.json deleted file mode 100644 index 0021cab2a9..0000000000 --- a/tests/test_configs/ranking/ranking_ubuntu_v2_test.json +++ /dev/null @@ -1,131 +0,0 @@ -{ - "dataset_reader": { - "class_name": "ubuntu_v2_reader", - "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_data" - }, - "dataset_iterator": { - "class_name": "siamese_iterator", - "num_samples": 1024, - "seed": 243 - }, - "chainer": { - "in": [ - "x" - ], - "in_y": [ - "y" - ], - "pipe": [ - { - "id": "preproc", - "class_name": "siamese_preprocessor", - "use_matrix": true, - "num_ranking_samples": 2, - "max_sequence_length": 50, - "fit_on": [ - "x" - ], - "in": [ - "x" - ], - "out": [ - "x_proc" - ], - "sent_vocab": { - "id": "siam_sent_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/ubuntu_v2_vocabs/sent.dict", - "load_path": "{MODELS_PATH}/ubuntu_v2_vocabs/sent.dict" - }, - "tokenizer": { - "class_name": "split_tokenizer" - }, - "vocab": { - "id": "siam_vocab", - "class_name": "simple_vocab", - "save_path": "{MODELS_PATH}/ubuntu_v2_vocabs/tok.dict", - "load_path": "{MODELS_PATH}/ubuntu_v2_vocabs/tok.dict" - }, - "embedder": { - "id": "siam_embedder", - "class_name": "fasttext", - "load_path": "{DOWNLOADS_PATH}/embeddings/ft_dummy_300.bin", - "dim": 300 - } - }, - { - "id": "embeddings", - 
"class_name": "emb_mat_assembler", - "embedder": "#siam_embedder", - "vocab": "#siam_vocab" - }, - { - "in": [ - "x_proc" - ], - "in_y": [ - "y" - ], - "out": [ - "y_predicted" - ], - "class_name": "bilstm_nn", - "len_vocab": "#siam_vocab.len", - "use_matrix": "#preproc.use_matrix", - "max_sequence_length": "#preproc.max_sequence_length", - "embedding_dim": "#siam_embedder.dim", - "seed": 243, - "hidden_dim": 300, - "emb_matrix": "#embeddings.emb_mat", - "learning_rate": 0.001, - "triplet_loss": false, - "batch_size": 256, - "interact_pred_num": 3, - "save_path": "{MODELS_PATH}/ubuntu_v2_model/model_weights.h5", - "load_path": "{MODELS_PATH}/ubuntu_v2_model/model_weights.h5" - } - ], - "out": [ - "y_predicted" - ] - }, - "train": { - "epochs": 200, - "batch_size": 256, - "pytest_max_batches": 2, - "train_metrics": [], - "metrics": [ - "r@1", - "rank_response" - ], - "validation_patience": 10, - "val_every_n_epochs": 1, - "log_every_n_batches": 1000, - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" - ], - "download": [ - { - "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_data.tar.gz", - "subdir": "{DOWNLOADS_PATH}/ubuntu_v2_data" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/ft_dummy_300.bin", - "subdir": "{DOWNLOADS_PATH}/embeddings" - } - ] - } -} \ No newline at end of file diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py index 987f07bff8..2a5d37d1a7 100644 --- a/tests/test_quick_start.py +++ b/tests/test_quick_start.py @@ -161,14 +161,14 @@ ("elmo/elmo_1b_benchmark_test.json", "elmo_1b_benchmark_test", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], }, "ranking": { - ("ranking/ranking_insurance_test.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - ("ranking/ranking_insurance_interact_test.json", "ranking", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], - ("ranking/ranking_ubuntu_v2_test.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("ranking/ranking_insurance.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("ranking/ranking_insurance_interact.json", "ranking", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], + ("ranking/ranking_ubuntu_v2.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("ranking/ranking_ubuntu_v2_interact.json", "ranking", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], - ("ranking/ranking_ubuntu_v2_mt_test.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - ("ranking/ranking_ubuntu_v2_mt_interact_test.json", "ranking", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], - ("ranking/paraphrase_ident_paraphraser_test.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - ("ranking/paraphrase_ident_paraphraser_interact_test.json", "ranking", + ("ranking/ranking_ubuntu_v2_mt.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("ranking/ranking_ubuntu_v2_mt_interact.json", "ranking", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], + ("ranking/paraphrase_ident_paraphraser.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("ranking/paraphrase_ident_paraphraser_interact.json", "ranking", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], ("ranking/paraphrase_ident_paraphraser_pretrain.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("ranking/paraphrase_ident_paraphraser_tune.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], @@ 
-177,14 +177,14 @@ ("ranking/paraphrase_ident_paraphraser_elmo.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("ranking/paraphrase_ident_elmo_interact.json", "ranking", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], - ("ranking/paraphrase_ident_qqp_test.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - ("ranking/paraphrase_ident_qqp_bilstm_interact_test.json", "ranking", + ("ranking/paraphrase_ident_qqp.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("ranking/paraphrase_ident_qqp_bilstm_interact.json", "ranking", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], - ("ranking/paraphrase_ident_qqp_bilstm_test.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - ("ranking/paraphrase_ident_qqp_interact_test.json", "ranking", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], - ("ranking/ranking_ubuntu_v2_bert_uncased_test.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - ("ranking/ranking_ubuntu_v2_bert_sep_test.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - ("ranking/ranking_ubuntu_v2_bert_sep_interact_test.json", "ranking", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], + ("ranking/paraphrase_ident_qqp_bilstm.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("ranking/paraphrase_ident_qqp_interact.json", "ranking", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], + ("ranking/ranking_ubuntu_v2_bert_uncased.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("ranking/ranking_ubuntu_v2_bert_sep.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("ranking/ranking_ubuntu_v2_bert_sep_interact.json", "ranking", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], ("ranking/ranking_ubuntu_v1_mt_word2vec_smn.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("ranking/ranking_ubuntu_v1_mt_word2vec_dam.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("ranking/ranking_ubuntu_v1_mt_word2vec_dam_transformer.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], From b06db0f0ab9028f55333d1f28ff8c33e9754d4a4 Mon Sep 17 00:00:00 2001 From: Aleksei Lymar Date: Wed, 19 Feb 2020 12:30:32 +0300 Subject: [PATCH 09/15] docs: fix faq doc links (#1131) resolves #1128 --- docs/features/skills/faq.rst | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/docs/features/skills/faq.rst b/docs/features/skills/faq.rst index d2b15fcded..9202951d2c 100644 --- a/docs/features/skills/faq.rst +++ b/docs/features/skills/faq.rst @@ -145,33 +145,15 @@ Available Data and Pretrained Models As an example you can try pretrained models on FAQ dataset in English: MIPT FAQ for entrants - https://mipt.ru/english/edu/faqs/ - - :: - - tfidf_logreg_classifier_en_mipt_faq - http://files.deeppavlov.ai/faq/mipt/tfidf_logreg_classifier_en_mipt_faq.pkl - tfidf_vectorizer_en_mipt_faq - http://files.deeppavlov.ai/faq/mipt/tfidf_vectorizer_en_mipt_faq.pkl - - -- **tfidf_logreg_classifier_en_mipt_faq.pkl** - pre-trained logistic regression classifier for classifying input question (vectorized by tfidf) -- **tfidf_vectorizer_en_mipt_faq.pkl** - pre-trained model for TF-IDF vectorizer based on MIPT FAQ +- `tfidf_logreg_classifier_en_mipt_faq.pkl `__ - pre-trained logistic regression classifier for classifying input question (vectorized by tfidf) +- `tfidf_vectorizer_en_mipt_faq.pkl `__ - pre-trained model for TF-IDF vectorizer based on MIPT FAQ Example config - :config:`tfidf_logreg_en_faq.json ` -Also you can use pretrained model on Russan FAQ dataset from school-site: http://www.ftl.name/page/989 - - :: - - tfidf_cos_sim_classifier - 
http://files.deeppavlov.ai/faq/school/faq_tfidf_cos_model.pkl - tfidf_logreg_classifier - http://files.deeppavlov.ai/faq/school/faq_tfidf_logreg_model.pkl - fasttext_cos_classifier - http://files.deeppavlov.ai/faq/school/faq_fasttext_cos_model.pkl - tfidf_vectorizer_ruwiki - http://files.deeppavlov.ai/vectorizer/tfidf_vectorizer_ruwiki.pkl - - -- **tfidf_cos_sim_classifier.pkl** - pre-trained cosine similarity classifier for classifying input question (vectorized by tfidf) -- **tfidf_logreg_classifier.pkl** - pre-trained logistic regression classifier for classifying input question (vectorized by tfidf) -- **fasttext_cos_classifier.pkl** - pre-trained cosine similarity classifier for classifying input question (vectorized by word embeddings) -- **tfidf_vectorizer_ruwiki.pkl** - pre-trained model for TF-IDF vectorizer based on Russian Wikipedia - - +Also you can use pretrained model on Russan FAQ dataset from school-site: https://gobu.ftl.name/page/1279/ +- `tfidf_cos_sim_classifier.pkl `__ - pre-trained cosine similarity classifier for classifying input question (vectorized by tfidf) +- `tfidf_logreg_classifier_v2.pkl `__ - pre-trained logistic regression classifier for classifying input question (vectorized by tfidf) +- `fasttext_cos_classifier.pkl `__ - pre-trained cosine similarity classifier for classifying input question (vectorized by word embeddings) +- `tfidf_vectorizer_ruwiki_v2.pkl `__ - pre-trained model for TF-IDF vectorizer based on Russian Wikipedia From 70e7fa0d6bb3c8d05067cab396c96e25e7c219f3 Mon Sep 17 00:00:00 2001 From: Aleksei Lymar Date: Thu, 20 Feb 2020 16:17:19 +0300 Subject: [PATCH 10/15] doc: fix faq python usage example code --- docs/features/skills/faq.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/features/skills/faq.rst b/docs/features/skills/faq.rst index 9202951d2c..e1429bc312 100644 --- a/docs/features/skills/faq.rst +++ b/docs/features/skills/faq.rst @@ -18,10 +18,9 @@ Building .. 
code:: python - from deeppavlov import configs - from deeppavlov.core.commands.infer import build_model + from deeppavlov import build_model, configs - faq = build_model(configs.faq.tfidf_logreg_en_faq, load_trained=True) + faq = build_model(configs.faq.tfidf_logreg_en_faq, download=True) Inference From 46c6ae6cd8a1f16af9e91fae6f4ce77e22fb7d49 Mon Sep 17 00:00:00 2001 From: Pavel Gulyaev Date: Thu, 20 Feb 2020 16:18:16 +0300 Subject: [PATCH 11/15] fix: allow `\t` in vocab entities (#1095) --- deeppavlov/core/data/simple_vocab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeppavlov/core/data/simple_vocab.py b/deeppavlov/core/data/simple_vocab.py index 73edb5aec9..dd7485d40e 100644 --- a/deeppavlov/core/data/simple_vocab.py +++ b/deeppavlov/core/data/simple_vocab.py @@ -136,7 +136,7 @@ def load_line(self, ln): token = ln.strip().split()[0] cnt = self._min_freq else: - token, cnt = ln.split('\t', 1) + token, cnt = ln.rsplit('\t', 1) return token, cnt @property From ecd73913739da12726f7a76f62a4c0ea15618a8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C4=81rcis=20Gas=C5=ABns?= Date: Fri, 21 Feb 2020 17:43:59 +0300 Subject: [PATCH 12/15] https://universaldependencies.org URL was wrong --- docs/features/models/morphotagger.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/models/morphotagger.rst b/docs/features/models/morphotagger.rst index e1f962d00a..6437055698 100644 --- a/docs/features/models/morphotagger.rst +++ b/docs/features/models/morphotagger.rst @@ -19,7 +19,7 @@ They also achieve the state-of-the-art performance among open source systems. The BERT-based model is trained on `Universal -Dependencies corpora `__ +Dependencies corpora `__ (version 2.3), while all the other models were trained on Universal Dependencies 2.0 corpora. From 8428cfe36bacd10dd0fc51117559c77cf7d02220 Mon Sep 17 00:00:00 2001 From: Aleksei Lymar Date: Tue, 25 Feb 2020 13:29:19 +0300 Subject: [PATCH 13/15] refactor: separate go-bot trackers and network (#1134) * Add launch script * Refactor trackers * Put state dict into tracker class * Add multiple user tracker * Finish add multiple user * Delete class DefaultTracker * Move api call to tracker * Move calc action mask to tracker * Move previous action update to tracker class * Move database control to tracker class * Fix bug in Multiple User tracker * docs: correct tracker classes list in apiref * Fix imports order * Add type specifications to bot modules * Replace os.path with Path * Fix bug for minimal config * chore: remove a debugging launch file Co-authored-by: jen1995 --- deeppavlov/models/go_bot/network.py | 195 ++++++++------------- deeppavlov/models/go_bot/tracker.py | 254 ++++++++++++++++++---------- docs/apiref/models/go_bot.rst | 6 +- 3 files changed, 236 insertions(+), 219 deletions(-) diff --git a/deeppavlov/models/go_bot/network.py b/deeppavlov/models/go_bot/network.py index 9cb4aa37e7..87c222d10d 100644 --- a/deeppavlov/models/go_bot/network.py +++ b/deeppavlov/models/go_bot/network.py @@ -13,7 +13,6 @@ # limitations under the License. 
import collections -import copy import json import re from logging import getLogger @@ -32,7 +31,7 @@ from deeppavlov.core.layers import tf_layers from deeppavlov.core.models.component import Component from deeppavlov.core.models.tf_model import LRScheduledTFModel -from deeppavlov.models.go_bot.tracker import Tracker +from deeppavlov.models.go_bot.tracker import FeaturizedTracker, DialogueStateTracker, MultipleUserStateTracker log = getLogger(__name__) @@ -117,7 +116,7 @@ class GoalOrientedBot(LRScheduledTFModel): def __init__(self, tokenizer: Component, - tracker: Tracker, + tracker: FeaturizedTracker, template_path: str, save_path: str, hidden_size: int = 128, @@ -151,7 +150,7 @@ def __init__(self, super().__init__(load_path=load_path, save_path=save_path, **kwargs) self.tokenizer = tokenizer - self.default_tracker = tracker + self.bow_embedder = bow_embedder self.embedder = embedder self.slot_filler = slot_filler @@ -167,7 +166,9 @@ def __init__(self, self.n_actions = len(self.templates) log.info(f"{self.n_actions} templates loaded.") - self.database = database + self.default_tracker = tracker + self.dialogue_state_tracker = DialogueStateTracker(tracker.slot_names, self.n_actions, hidden_size, database) + self.api_call_id = -1 if api_call_action is not None: self.api_call_id = self.templates.actions.index(api_call_action) @@ -190,7 +191,7 @@ def __init__(self, new_network_parameters.update(network_parameters) self._init_network(**new_network_parameters) - self.states = {} + self.multiple_user_state_tracker = MultipleUserStateTracker() self.reset() def _init_network(self, @@ -256,7 +257,7 @@ def _init_network(self, def _encode_context(self, tokens: List[str], - state: dict) -> List[np.ndarray]: + tracker: DialogueStateTracker) -> List[np.ndarray]: # Bag of words features bow_features = [] if callable(self.bow_embedder): @@ -300,28 +301,29 @@ def _encode_context(self, attn_key = np.array([], dtype=np.float32) if self.attn: if self.attn.action_as_key: - attn_key = np.hstack((attn_key, state['prev_action'])) + attn_key = np.hstack((attn_key, tracker.prev_action)) if self.attn.intent_as_key: attn_key = np.hstack((attn_key, intent_features)) if len(attn_key) == 0: attn_key = np.array([1], dtype=np.float32) - state_features = state['tracker'].get_features() + state_features = tracker.get_features() # Other features result_matches_state = 0. - if state['db_result'] is not None: - matching_items = state['tracker'].get_state().items() - result_matches_state = all(v == state['db_result'].get(s) + if tracker.db_result is not None: + matching_items = tracker.get_state().items() + result_matches_state = all(v == tracker.db_result.get(s) for s, v in matching_items if v != 'dontcare') * 1. 
- context_features = np.array([bool(state['current_db_result']) * 1., - (state['current_db_result'] == {}) * 1., - (state['db_result'] is None) * 1., - bool(state['db_result']) * 1., - (state['db_result'] == {}) * 1., - result_matches_state], - dtype=np.float32) + context_features = np.array([ + bool(tracker.current_db_result) * 1., + (tracker.current_db_result == {}) * 1., + (tracker.db_result is None) * 1., + bool(tracker.db_result) * 1., + (tracker.db_result == {}) * 1., + result_matches_state + ], dtype=np.float32) if self.debug: log.debug(f"Context features = {context_features}") @@ -330,27 +332,27 @@ def _encode_context(self, f", num intent features = {intent_features}" + \ f", num state features = {len(state_features)}" + \ f", num context features = {len(context_features)}" + \ - f", prev_action shape = {len(state['prev_action'])}" + f", prev_action shape = {len(tracker.prev_action)}" log.debug(debug_msg) concat_feats = np.hstack((bow_features, emb_features, intent_features, state_features, context_features, - state['prev_action'])) + tracker.prev_action)) return concat_feats, emb_context, attn_key def _encode_response(self, act: str) -> int: return self.templates.actions.index(act) - def _decode_response(self, action_id: int, state: dict) -> str: + def _decode_response(self, action_id: int, tracker: DialogueStateTracker) -> str: """ Convert action template id and entities from tracker to final response. """ template = self.templates.templates[int(action_id)] - slots = state['tracker'].get_state() - if state['db_result'] is not None: - for k, v in state['db_result'].items(): + slots = tracker.get_state() + if tracker.db_result is not None: + for k, v in tracker.db_result.items(): slots[k] = str(v) resp = template.generate_text(slots) @@ -359,55 +361,36 @@ def _decode_response(self, action_id: int, state: dict) -> str: resp = re.sub("#([A-Za-z]+)", "dontcare", resp).lower() return resp - def calc_action_mask(self, state: dict) -> np.ndarray: - mask = np.ones(self.n_actions, dtype=np.float32) - if self.use_action_mask: - known_entities = {**state['tracker'].get_state(), - **(state['db_result'] or {})} - for a_id in range(self.n_actions): - tmpl = str(self.templates.templates[a_id]) - for entity in set(re.findall('#([A-Za-z]+)', tmpl)): - if entity not in known_entities: - mask[a_id] = 0. - # forbid two api calls in a row - if np.any(state['prev_action']): - prev_act_id = np.argmax(state['prev_action']) - if prev_act_id == self.api_call_id: - mask[prev_act_id] = 0. 
- return mask - def prepare_data(self, x: List[dict], y: List[dict]) -> List[np.ndarray]: b_features, b_u_masks, b_a_masks, b_actions = [], [], [], [] b_emb_context, b_keys = [], [] # for attention max_num_utter = max(len(d_contexts) for d_contexts in x) for d_contexts, d_responses in zip(x, y): - state = self._zero_state() + self.dialogue_state_tracker.reset_state() d_features, d_a_masks, d_actions = [], [], [] d_emb_context, d_key = [], [] # for attention + for context, response in zip(d_contexts, d_responses): tokens = self.tokenizer([context['text'].lower().strip()])[0] # update state - state['current_db_result'] = context.get('db_result', None) - if state['current_db_result'] is not None: - state['db_result'] = state['current_db_result'] + self.dialogue_state_tracker.get_ground_truth_db_result_from(context) + if callable(self.slot_filler): context_slots = self.slot_filler([tokens])[0] - state['tracker'].update_state(context_slots) + self.dialogue_state_tracker.update_state(context_slots) - features, emb_context, key = self._encode_context(tokens, - state=state) + features, emb_context, key = self._encode_context(tokens, tracker=self.dialogue_state_tracker) d_features.append(features) d_emb_context.append(emb_context) d_key.append(key) - d_a_masks.append(self.calc_action_mask(state)) + d_a_masks.append(self.dialogue_state_tracker.calc_action_mask(self.api_call_id)) action_id = self._encode_response(response['act']) d_actions.append(action_id) # update state # - previous action is teacher-forced here - state['prev_action'] *= 0. - state['prev_action'][action_id] = 1. + self.dialogue_state_tracker.update_previous_action(action_id) if self.debug: log.debug(f"True response = '{response['text']}'.") @@ -434,61 +417,39 @@ def prepare_data(self, x: List[dict], y: List[dict]) -> List[np.ndarray]: def train_on_batch(self, x: List[dict], y: List[dict]) -> dict: return self.network_train_on_batch(*self.prepare_data(x, y)) - def _infer(self, tokens: List[str], state: dict) -> List: - features, emb_context, key = self._encode_context(tokens, state=state) - action_mask = self.calc_action_mask(state) + def _infer(self, tokens: List[str], tracker: DialogueStateTracker) -> List: + features, emb_context, key = self._encode_context(tokens, tracker=tracker) + action_mask = tracker.calc_action_mask(self.api_call_id) probs, state_c, state_h = \ self.network_call([[features]], [[emb_context]], [[key]], - [[action_mask]], [[state['network_state'][0]]], - [[state['network_state'][1]]], + [[action_mask]], [[tracker.network_state[0]]], + [[tracker.network_state[1]]], prob=True) return probs, np.argmax(probs), (state_c, state_h) def _infer_dialog(self, contexts: List[dict]) -> List[str]: res = [] - state = self._zero_state() + self.dialogue_state_tracker.reset_state() for context in contexts: if context.get('prev_resp_act') is not None: - prev_act_id = self._encode_response(context['prev_resp_act']) + previous_act_id = self._encode_response(context['prev_resp_act']) # previous action is teacher-forced - state['prev_action'] *= 0. - state['prev_action'][prev_act_id] = 1. 
- - state['current_db_result'] = context.get('db_result') - if state['current_db_result'] is not None: - state['db_result'] = state['current_db_result'] + self.dialogue_state_tracker.update_previous_action(previous_act_id) + self.dialogue_state_tracker.get_ground_truth_db_result_from(context) tokens = self.tokenizer([context['text'].lower().strip()])[0] + if callable(self.slot_filler): utter_slots = self.slot_filler([tokens])[0] - state['tracker'].update_state(utter_slots) - _, pred_act_id, state['network_state'] = \ - self._infer(tokens, state=state) - state['prev_action'] *= 0. - state['prev_action'][pred_act_id] = 1. + self.dialogue_state_tracker.update_state(utter_slots) + _, predicted_act_id, self.dialogue_state_tracker.network_state = \ + self._infer(tokens, tracker=self.dialogue_state_tracker) - resp = self._decode_response(pred_act_id, state) + self.dialogue_state_tracker.update_previous_action(predicted_act_id) + resp = self._decode_response(predicted_act_id, self.dialogue_state_tracker) res.append(resp) return res - def make_api_call(self, state: dict) -> dict: - slots = state['tracker'].get_state() - db_results = [] - if self.database is not None: - # filter slot keys with value equal to 'dontcare' as - # there is no such value in database records - # and remove unknown slot keys (for example, 'this' in dstc2 tracker) - db_slots = {s: v for s, v in slots.items() - if (v != 'dontcare') and (s in self.database.keys)} - db_results = self.database([db_slots])[0] - # filter api results if there are more than one - if len(db_results) > 1: - db_results = [r for r in db_results if r != state['db_result']] - else: - log.warning("No database specified.") - log.info(f"Made api_call with {slots}, got {len(db_results)} results.") - return {} if not db_results else db_results[0] - def __call__(self, batch: Union[List[dict], List[str]], user_ids: Optional[List] = None) -> List[str]: @@ -498,54 +459,38 @@ def __call__(self, if not user_ids: user_ids = ['finn'] * len(batch) for user_id, x in zip(user_ids, batch): - if user_id not in self.states: - self.reset(user_id) - state = self.states[user_id] - state['current_db_result'] = None + if not self.multiple_user_state_tracker.check_new_user(user_id): + self.multiple_user_state_tracker.init_new_tracker(user_id, self.dialogue_state_tracker) + tracker = self.multiple_user_state_tracker.get_user_tracker(user_id) tokens = self.tokenizer([x.lower().strip()])[0] + if callable(self.slot_filler): utter_slots = self.slot_filler([tokens])[0] - state['tracker'].update_state(utter_slots) - _, pred_act_id, state['network_state'] = \ - self._infer(tokens, state=state) - state['prev_action'] *= 0. - state['prev_action'][pred_act_id] = 1. + tracker.update_state(utter_slots) + + _, predicted_act_id, tracker.network_state = \ + self._infer(tokens, tracker=tracker) + + tracker.update_previous_action(predicted_act_id) # if made api_call, then respond with next prediction - if pred_act_id == self.api_call_id: - state['current_db_result'] = self.make_api_call(state) - if state['current_db_result'] is not None: - state['db_result'] = state['current_db_result'] - _, pred_act_id, state['network_state'] = \ - self._infer(tokens, state=state) - state['prev_action'] *= 0. - state['prev_action'][pred_act_id] = 1. 
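The net effect of this refactor is that all per-dialogue state (slot values, the one-hot previous action, the last database result, the LSTM state) moves out of the bot's ad-hoc ``state`` dicts into ``DialogueStateTracker``, with one independent tracker per user held by ``MultipleUserStateTracker``. A hedged usage sketch based on the tracker code later in this patch; the slot names, action count and ids below are made up:

.. code:: python

    from deeppavlov.models.go_bot.tracker import DialogueStateTracker, MultipleUserStateTracker

    tracker = DialogueStateTracker(slot_names=['cuisine', 'area'], n_actions=4,
                                   hidden_size=128, database=None)
    tracker.update_state({'cuisine': 'thai'})    # slot filler output for the current utterance
    tracker.update_previous_action(2)            # one-hot encoding of the last system action
    tracker.prev_action                          # -> array([0., 0., 1., 0.], dtype=float32)
    tracker.calc_action_mask(api_call_id=2)      # -> array([1., 1., 0., 1.]): no two api calls in a row

    users = MultipleUserStateTracker()
    users.init_new_tracker('finn', tracker)      # fresh tracker with the same configuration
    users.get_user_tracker('finn').get_state()   # -> {}, state is kept separately per user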
- - resp = self._decode_response(pred_act_id, state) + if predicted_act_id == self.api_call_id: + tracker.make_api_call() + + _, predicted_act_id, tracker.network_state = \ + self._infer(tokens, tracker=tracker) + + tracker.update_previous_action(predicted_act_id) + + resp = self._decode_response(predicted_act_id, tracker) res.append(resp) - self.states[user_id] = state return res # batch is a list of dialogs, user_ids ignored return [self._infer_dialog(x) for x in batch] - def _zero_state(self) -> dict: - return { - 'tracker': copy.deepcopy(self.default_tracker), - 'db_result': None, - 'current_db_result': None, - 'prev_action': np.zeros(self.n_actions, dtype=np.float32), - 'network_state': ( - np.zeros([1, self.hidden_size], dtype=np.float32), - np.zeros([1, self.hidden_size], dtype=np.float32) - ) - } - def reset(self, user_id: Union[None, str, int] = None) -> None: - if user_id is None: - self.states.clear() - else: - self.states[user_id] = self._zero_state() + self.multiple_user_state_tracker.reset(user_id) if self.debug: log.debug("Bot reset.") diff --git a/deeppavlov/models/go_bot/tracker.py b/deeppavlov/models/go_bot/tracker.py index bcf7c7f866..b8f9f5edaa 100644 --- a/deeppavlov/models/go_bot/tracker.py +++ b/deeppavlov/models/go_bot/tracker.py @@ -13,11 +13,15 @@ # limitations under the License. from abc import ABCMeta, abstractmethod -from typing import List, Dict, Union, Tuple, Any +from logging import getLogger +from typing import List, Dict, Union, Tuple, Any, Iterator import numpy as np from deeppavlov.core.common.registry import register +from deeppavlov.core.models.component import Component + +log = getLogger(__name__) class Tracker(metaclass=ABCMeta): @@ -27,16 +31,10 @@ class Tracker(metaclass=ABCMeta): """ @abstractmethod - def reset_state(self) -> None: - """Resets dialogue state""" - pass - - @abstractmethod - def update_state(self, - slots: Union[List[Tuple[str, Any]], Dict[str, Any]]) -> 'Tracker': + def update_state(self, slots: Union[List[Tuple[str, Any]], Dict[str, Any]]) -> None: """ Updates dialogue state with new ``slots``, calculates features. - + Returns: Tracker: .""" pass @@ -48,6 +46,11 @@ def get_state(self) -> Dict[str, Any]: Dict[str, Any]: dictionary with current slots and their values.""" pass + @abstractmethod + def reset_state(self) -> None: + """Resets dialogue state""" + pass + @abstractmethod def get_features(self) -> np.ndarray: """ @@ -56,61 +59,6 @@ def get_features(self) -> np.ndarray: pass -class DefaultTracker(Tracker): - """ - Tracker that overwrites slots with new values. - Features are binary indicators: slot is present/absent. - - Parameters: - slot_names: list of slots that should be tracked. 
- """ - - def __init__(self, slot_names: List[str]) -> None: - self.slot_names = list(slot_names) - self.reset_state() - - @property - def state_size(self): - return len(self.slot_names) - - @property - def num_features(self): - return self.state_size - - def reset_state(self): - self.history = [] - self.curr_feats = np.zeros(self.num_features, dtype=np.float32) - - def update_state(self, slots): - def _filter(slots): - return filter(lambda s: s[0] in self.slot_names, slots) - - if isinstance(slots, list): - self.history.extend(_filter(slots)) - elif isinstance(slots, dict): - for slot, value in _filter(slots.items()): - self.history.append((slot, value)) - self.curr_feats = self._binary_features() - return self - - def get_state(self): - lasts = {} - for slot, value in self.history: - lasts[slot] = value - return lasts - - def _binary_features(self): - feats = np.zeros(self.state_size, dtype=np.float32) - lasts = self.get_state() - for i, slot in enumerate(self.slot_names): - if slot in lasts: - feats[i] = 1. - return feats - - def get_features(self): - return self.curr_feats - - @register('featurized_tracker') class FeaturizedTracker(Tracker): """ @@ -125,40 +73,38 @@ class FeaturizedTracker(Tracker): def __init__(self, slot_names: List[str]) -> None: self.slot_names = list(slot_names) - self.reset_state() + self.history = [] + self.current_features = None @property - def state_size(self): + def state_size(self) -> int: return len(self.slot_names) @property - def num_features(self): + def num_features(self) -> int: return self.state_size * 3 + 3 - def reset_state(self): - self.history = [] - self.curr_feats = np.zeros(self.num_features, dtype=np.float32) - def update_state(self, slots): - def _filter(slots): - return filter(lambda s: s[0] in self.slot_names, slots) - - prev_state = self.get_state() if isinstance(slots, list): - self.history.extend(_filter(slots)) + self.history.extend(self._filter(slots)) + elif isinstance(slots, dict): - for slot, value in _filter(slots.items()): + for slot, value in self._filter(slots.items()): self.history.append((slot, value)) + + prev_state = self.get_state() bin_feats = self._binary_features() diff_feats = self._diff_features(prev_state) new_feats = self._new_features(prev_state) - self.curr_feats = np.hstack((bin_feats, - diff_feats, - new_feats, - np.sum(bin_feats), - np.sum(diff_feats), - np.sum(new_feats))) - return self + + self.current_features = np.hstack(( + bin_feats, + diff_feats, + new_feats, + np.sum(bin_feats), + np.sum(diff_feats), + np.sum(new_feats)) + ) def get_state(self): lasts = {} @@ -166,7 +112,17 @@ def get_state(self): lasts[slot] = value return lasts - def _binary_features(self): + def reset_state(self): + self.history = [] + self.current_features = np.zeros(self.num_features, dtype=np.float32) + + def get_features(self): + return self.current_features + + def _filter(self, slots) -> Iterator: + return filter(lambda s: s[0] in self.slot_names, slots) + + def _binary_features(self) -> np.ndarray: feats = np.zeros(self.state_size, dtype=np.float32) lasts = self.get_state() for i, slot in enumerate(self.slot_names): @@ -174,22 +130,136 @@ def _binary_features(self): feats[i] = 1. 
return feats - def _diff_features(self, state): + def _diff_features(self, state) -> np.ndarray: feats = np.zeros(self.state_size, dtype=np.float32) curr_state = self.get_state() + for i, slot in enumerate(self.slot_names): - if (slot in curr_state) and (slot in state) and \ - (curr_state[slot] != state[slot]): + if slot in curr_state and slot in state and curr_state[slot] != state[slot]: feats[i] = 1. + return feats - def _new_features(self, state): + def _new_features(self, state) -> np.ndarray: feats = np.zeros(self.state_size, dtype=np.float32) curr_state = self.get_state() + for i, slot in enumerate(self.slot_names): - if (slot in curr_state) and (slot not in state): + if slot in curr_state and slot not in state: feats[i] = 1. + return feats - def get_features(self): - return self.curr_feats + +class DialogueStateTracker(FeaturizedTracker): + def __init__(self, slot_names, n_actions: int, hidden_size: int, database: Component = None) -> None: + super().__init__(slot_names) + self.db_result = None + self.current_db_result = None + self.database = database + + self.n_actions = n_actions + self.hidden_size = hidden_size + self.prev_action = np.zeros(n_actions, dtype=np.float32) + + self.network_state = ( + np.zeros([1, hidden_size], dtype=np.float32), + np.zeros([1, hidden_size], dtype=np.float32) + ) + + def reset_state(self): + super().reset_state() + self.db_result = None + self.current_db_result = None + self.prev_action = np.zeros(self.n_actions, dtype=np.float32) + + self.network_state = ( + np.zeros([1, self.hidden_size], dtype=np.float32), + np.zeros([1, self.hidden_size], dtype=np.float32) + ) + + def update_previous_action(self, prev_act_id: int) -> None: + self.prev_action *= 0. + self.prev_action[prev_act_id] = 1. + + def get_ground_truth_db_result_from(self, context: Dict[str, Any]): + self.current_db_result = context.get('db_result', None) + self._update_db_result() + + def make_api_call(self) -> None: + slots = self.get_state() + db_results = [] + if self.database is not None: + + # filter slot keys with value equal to 'dontcare' as + # there is no such value in database records + # and remove unknown slot keys (for example, 'this' in dstc2 tracker) + db_slots = { + s: v for s, v in slots.items() if v != 'dontcare' and s in self.database.keys + } + + db_results = self.database([db_slots])[0] + + # filter api results if there are more than one + # TODO: add sufficient criteria for database results ranking + if len(db_results) > 1: + db_results = [r for r in db_results if r != self.db_result] + else: + log.warning("No database specified.") + + log.info(f"Made api_call with {slots}, got {len(db_results)} results.") + self.current_db_result = {} if not db_results else db_results[0] + self._update_db_result() + + def calc_action_mask(self, api_call_id: int) -> np.ndarray: + mask = np.ones(self.n_actions, dtype=np.float32) + + if np.any(self.prev_action): + prev_act_id = np.argmax(self.prev_action) + if prev_act_id == api_call_id: + mask[prev_act_id] = 0. 
+ + return mask + + def _update_db_result(self): + if self.current_db_result is not None: + self.db_result = self.current_db_result + + +class MultipleUserStateTracker(object): + def __init__(self): + self._ids_to_trackers = {} + + def check_new_user(self, user_id: int) -> bool: + return user_id in self._ids_to_trackers + + def get_user_tracker(self, user_id: int) -> DialogueStateTracker: + if not self.check_new_user(user_id): + raise RuntimeError(f"The user with {user_id} ID is not being tracked") + + tracker = self._ids_to_trackers[user_id] + + # TODO: understand why setting current_db_result to None is necessary + tracker.current_db_result = None + return tracker + + def init_new_tracker(self, user_id: int, tracker_entity: DialogueStateTracker) -> None: + # TODO: implement a better way to init a tracker + tracker = DialogueStateTracker( + tracker_entity.slot_names, + tracker_entity.n_actions, + tracker_entity.hidden_size, + tracker_entity.database + ) + + self._ids_to_trackers[user_id] = tracker + + def reset(self, user_id: int = None) -> None: + if user_id is not None and not self.check_new_user(user_id): + raise RuntimeError(f"The user with {user_id} ID is not being tracked") + + if user_id is not None: + self._ids_to_trackers[user_id].reset_state() + else: + self._ids_to_trackers.clear() + diff --git a/docs/apiref/models/go_bot.rst b/docs/apiref/models/go_bot.rst index 6bc133b4ae..1c117d71b8 100644 --- a/docs/apiref/models/go_bot.rst +++ b/docs/apiref/models/go_bot.rst @@ -8,6 +8,8 @@ deeppavlov.models.go_bot .. autoclass:: deeppavlov.models.go_bot.tracker.Tracker -.. autoclass:: deeppavlov.models.go_bot.tracker.DefaultTracker - .. autoclass:: deeppavlov.models.go_bot.tracker.FeaturizedTracker + +.. autoclass:: deeppavlov.models.go_bot.tracker.DialogueStateTracker + +.. 
autoclass:: deeppavlov.models.go_bot.tracker.MultipleUserStateTracker From 9890ebd721515241ac50076d266c89f1fdc8f35c Mon Sep 17 00:00:00 2001 From: Mary Trofimova Date: Tue, 25 Feb 2020 19:04:02 +0300 Subject: [PATCH 14/15] feat: add bert embedder on transformers + bert sentence encoders (#1039) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: add basic bert embedder class * refactor: rename subword_masks to startofword_markers * feat: return attention mask in preprocessor * feat: rename vars in all bert-ner configs * feat: support word & subword level * fix: markers construction fix * feat: add word & subword level configs * feat: tokens&subtokens as input, output is dict * feat: add sep and max embs * feat: add mean pool embs * fix: level check fix * feat: rewrite sep+cls & add load warning * feat: all embedder configs in configs/embedder * fix: malformed table in docs * refactor: add parameter init in config * feat: two configs for token and sent-level * feat: add sentence bert links to doc * feat: add sentence bert to pretrained_vectors page * refactor: add links in docs * feat: add link to slavicbert paper * feat: sentence bert link in config * feat: bert embedder configs doc * fix: remove code duplication caused by merging * style: correct typings for bert_embedder * fix: new output for bert_ner_preprocessor moved to the back * feat: add transformers_preprocessor.py * fix: token_maksing_prob -> token_masking_prob * fix: correctly parse bert vocab path from config * feat: add a basic bert_embedder on transformers * feat: add mean and max embeddings to transformers_embedder.py * feat: calculate word embeddings with torch in transformers_embedder.py and zero_pad by default * feat: add configs with bert_embeddings * tests: add tests for bert_embedder configurations * fix: correct param names for disabling zero padding * docs: remove older bert-embedder and update documentation * chore: update transformers version requirement * fix: use correct device * chore: remove torchvision from requirements * fix: mention classes instead of intents in classification model * docs: add links for bert models converted to pytorch * docs: add a link on Transformers library * feat: return tokens and subtokens in bert_embedder.json * feat: add a bert sentence embedder configuration * docs: add links on toch bert models to pretrained_vectors.rst * docs: inited → initialized * chore: version 0.8.0 Co-authored-by: Aleksei Lymar --- deeppavlov/__init__.py | 2 +- .../sentiment_twitter_bert_emb.json | 149 ++++++++++++++++++ .../configs/embedder/bert_embedder.json | 43 +++++ .../embedder/bert_sentence_embedder.json | 43 +++++ .../elmo_en_1billion.json | 0 .../elmo_ru_news.json | 0 .../elmo_ru_twitter.json | 0 .../elmo_ru_wiki.json | 0 .../tfidf_vectorizer.json | 0 .../BERT/morpho_ru_syntagrus_bert.json | 9 +- .../configs/ner/ner_conll2003_bert.json | 9 +- deeppavlov/configs/ner/ner_ontonotes.json | 17 +- .../configs/ner/ner_ontonotes_bert.json | 9 +- .../configs/ner/ner_ontonotes_bert_emb.json | 126 +++++++++++++++ .../configs/ner/ner_ontonotes_bert_mult.json | 9 +- deeppavlov/configs/ner/ner_rus_bert.json | 9 +- .../syntax/syntax_ru_syntagrus_bert.json | 9 +- deeppavlov/core/common/registry.json | 9 +- .../classifiers/keras_classification_model.py | 2 +- .../models/embedders/transformers_embedder.py | 94 +++++++++++ .../models/preprocessors/bert_preprocessor.py | 74 +++++---- .../transformers_preprocessor.py | 76 +++++++++ deeppavlov/requirements/pytorch.txt | 1 + 
deeppavlov/requirements/tf-gpu.txt | 2 +- deeppavlov/requirements/tf.txt | 2 +- deeppavlov/requirements/transformers.txt | 1 + docs/apiref/models/embedders.rst | 4 + docs/conf.py | 5 +- docs/features/models/bert.rst | 90 ++++++++--- docs/features/pretrained_vectors.rst | 59 ++++--- tests/test_quick_start.py | 4 +- 31 files changed, 722 insertions(+), 135 deletions(-) create mode 100644 deeppavlov/configs/classifiers/sentiment_twitter_bert_emb.json create mode 100644 deeppavlov/configs/embedder/bert_embedder.json create mode 100644 deeppavlov/configs/embedder/bert_sentence_embedder.json rename deeppavlov/configs/{elmo_embedder => embedder}/elmo_en_1billion.json (100%) rename deeppavlov/configs/{elmo_embedder => embedder}/elmo_ru_news.json (100%) rename deeppavlov/configs/{elmo_embedder => embedder}/elmo_ru_twitter.json (100%) rename deeppavlov/configs/{elmo_embedder => embedder}/elmo_ru_wiki.json (100%) rename deeppavlov/configs/{vectorizer => embedder}/tfidf_vectorizer.json (100%) create mode 100644 deeppavlov/configs/ner/ner_ontonotes_bert_emb.json create mode 100644 deeppavlov/models/embedders/transformers_embedder.py create mode 100644 deeppavlov/models/preprocessors/transformers_preprocessor.py create mode 100644 deeppavlov/requirements/pytorch.txt create mode 100644 deeppavlov/requirements/transformers.txt diff --git a/deeppavlov/__init__.py b/deeppavlov/__init__.py index 02ab1d7808..ce7c03b1f1 100644 --- a/deeppavlov/__init__.py +++ b/deeppavlov/__init__.py @@ -39,7 +39,7 @@ def evaluate_model(config: [str, Path, dict], download: bool = False, recursive: except ImportError: 'Assuming that requirements are not yet installed' -__version__ = '0.7.1' +__version__ = '0.8.0' __author__ = 'Neural Networks and Deep Learning lab, MIPT' __description__ = 'An open source library for building end-to-end dialog systems and training chatbots.' 
__keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot'] diff --git a/deeppavlov/configs/classifiers/sentiment_twitter_bert_emb.json b/deeppavlov/configs/classifiers/sentiment_twitter_bert_emb.json new file mode 100644 index 0000000000..9552132c68 --- /dev/null +++ b/deeppavlov/configs/classifiers/sentiment_twitter_bert_emb.json @@ -0,0 +1,149 @@ +{ + "dataset_reader": { + "class_name": "basic_classification_reader", + "x": "Twit", + "y": "Class", + "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data" + }, + "dataset_iterator": { + "class_name": "basic_classification_iterator", + "seed": 42 + }, + "chainer": { + "in": [ + "x" + ], + "in_y": [ + "y" + ], + "pipe": [ + { + "id": "classes_vocab", + "class_name": "simple_vocab", + "fit_on": [ + "y" + ], + "save_path": "{MODEL_PATH}/classes.dict", + "load_path": "{MODEL_PATH}/classes.dict", + "in": "y", + "out": "y_ids" + }, + { + "class_name": "transformers_bert_preprocessor", + "vocab_file": "{BERT_PATH}/vocab.txt", + "do_lower_case": false, + "max_seq_length": 512, + "in": ["x"], + "out": ["tokens", "subword_tokens", "subword_tok_ids", "startofword_markers", "attention_mask"] + }, + { + "class_name": "transformers_bert_embedder", + "id": "my_embedder", + "bert_config_path": "{BERT_PATH}/bert_config.json", + "truncate": false, + "load_path": "{BERT_PATH}", + "in": ["subword_tok_ids", "startofword_markers", "attention_mask"], + "out": ["word_emb", "subword_emb", "max_emb", "mean_emb", "pooler_output"] + }, + { + "in": "y_ids", + "out": "y_onehot", + "class_name": "one_hotter", + "depth": "#classes_vocab.len", + "single_vector": true + }, + { + "in": [ + "word_emb" + ], + "in_y": [ + "y_onehot" + ], + "out": [ + "y_pred_probas" + ], + "main": true, + "class_name": "keras_classification_model", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "embedding_size": "#my_embedder.dim", + "n_classes": "#classes_vocab.len", + "kernel_sizes_cnn": [ + 3, + 5, + 7 + ], + "filters_cnn": 256, + "optimizer": "Adam", + "learning_rate": 0.01, + "learning_rate_decay": 0.1, + "loss": "binary_crossentropy", + "last_layer_activation": "softmax", + "coef_reg_cnn": 1e-3, + "coef_reg_den": 1e-2, + "dropout_rate": 0.5, + "dense_size": 100, + "model_name": "cnn_model" + }, + { + "in": "y_pred_probas", + "out": "y_pred_ids", + "class_name": "proba2labels", + "max_proba": true + }, + { + "in": "y_pred_ids", + "out": "y_pred_labels", + "ref": "classes_vocab" + } + ], + "out": [ + "y_pred_labels" + ] + }, + "train": { + "epochs": 100, + "batch_size": 64, + "metrics": [ + "accuracy", + "f1_macro", + { + "name": "roc_auc", + "inputs": ["y_onehot", "y_pred_probas"] + } + ], + "validation_patience": 5, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "evaluation_targets": [ + "valid", + "test" + ], + "class_name": "nn_trainer" + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/classifiers/sentiment_twitter_bert_emb", + "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_pt" + }, + "requirements": [ + "{DEEPPAVLOV_PATH}/requirements/tf.txt", + "{DEEPPAVLOV_PATH}/requirements/transformers.txt", + "{DEEPPAVLOV_PATH}/requirements/pytorch.txt" + ], + "download": [ + { + "url": "http://files.deeppavlov.ai/datasets/sentiment_twitter_data.tar.gz", + "subdir": "{DOWNLOADS_PATH}" + }, + { + "url": 
"http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_pt.tar.gz", + "subdir": "{DOWNLOADS_PATH}/bert_models" + } + ] + } +} diff --git a/deeppavlov/configs/embedder/bert_embedder.json b/deeppavlov/configs/embedder/bert_embedder.json new file mode 100644 index 0000000000..99282cf7f3 --- /dev/null +++ b/deeppavlov/configs/embedder/bert_embedder.json @@ -0,0 +1,43 @@ +{ + "chainer": { + "in": ["texts"], + "pipe": [ + { + "class_name": "transformers_bert_preprocessor", + "vocab_file": "{BERT_PATH}/vocab.txt", + "do_lower_case": false, + "max_seq_length": 512, + "in": ["texts"], + "out": ["tokens", "subword_tokens", "subword_tok_ids", "startofword_markers", "attention_mask"] + }, + { + "class_name": "transformers_bert_embedder", + "bert_config_path": "{BERT_PATH}/bert_config.json", + "load_path": "{BERT_PATH}", + "truncate": true, + "in": ["subword_tok_ids", "startofword_markers", "attention_mask"], + "out": ["word_emb", "subword_emb", "max_emb", "mean_emb", "pooler_output"] + } + ], + "out": ["tokens", "word_emb", "subword_tokens", "subword_emb", "max_emb", "mean_emb", "pooler_output"] + }, + "train": {}, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/multi_cased_L-12_H-768_A-12_pt" + }, + "requirements": [ + "{DEEPPAVLOV_PATH}/requirements/transformers.txt", + "{DEEPPAVLOV_PATH}/requirements/pytorch.txt" + ], + "labels": {}, + "download": [ + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12_pt.tar.gz", + "subdir": "{DOWNLOADS_PATH}/bert_models" + } + ] + } +} diff --git a/deeppavlov/configs/embedder/bert_sentence_embedder.json b/deeppavlov/configs/embedder/bert_sentence_embedder.json new file mode 100644 index 0000000000..3ea12b3cfc --- /dev/null +++ b/deeppavlov/configs/embedder/bert_sentence_embedder.json @@ -0,0 +1,43 @@ +{ + "chainer": { + "in": ["texts"], + "pipe": [ + { + "class_name": "transformers_bert_preprocessor", + "vocab_file": "{BERT_PATH}/vocab.txt", + "do_lower_case": false, + "max_seq_length": 512, + "in": ["texts"], + "out": ["tokens", "subword_tokens", "subword_tok_ids", "startofword_markers", "attention_mask"] + }, + { + "class_name": "transformers_bert_embedder", + "bert_config_path": "{BERT_PATH}/bert_config.json", + "load_path": "{BERT_PATH}", + "truncate": false, + "in": ["subword_tok_ids", "startofword_markers", "attention_mask"], + "out": ["word_emb", "subword_emb", "max_emb", "mean_emb", "pooler_output"] + } + ], + "out": ["max_emb", "mean_emb", "pooler_output"] + }, + "train": {}, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/sentence_multi_cased_L-12_H-768_A-12_pt" + }, + "requirements": [ + "{DEEPPAVLOV_PATH}/requirements/transformers.txt", + "{DEEPPAVLOV_PATH}/requirements/pytorch.txt" + ], + "labels": {}, + "download": [ + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12_pt.tar.gz", + "subdir": "{DOWNLOADS_PATH}/bert_models" + } + ] + } +} diff --git a/deeppavlov/configs/elmo_embedder/elmo_en_1billion.json b/deeppavlov/configs/embedder/elmo_en_1billion.json similarity index 100% rename from deeppavlov/configs/elmo_embedder/elmo_en_1billion.json rename to deeppavlov/configs/embedder/elmo_en_1billion.json diff --git a/deeppavlov/configs/elmo_embedder/elmo_ru_news.json b/deeppavlov/configs/embedder/elmo_ru_news.json similarity index 100% 
rename from deeppavlov/configs/elmo_embedder/elmo_ru_news.json rename to deeppavlov/configs/embedder/elmo_ru_news.json diff --git a/deeppavlov/configs/elmo_embedder/elmo_ru_twitter.json b/deeppavlov/configs/embedder/elmo_ru_twitter.json similarity index 100% rename from deeppavlov/configs/elmo_embedder/elmo_ru_twitter.json rename to deeppavlov/configs/embedder/elmo_ru_twitter.json diff --git a/deeppavlov/configs/elmo_embedder/elmo_ru_wiki.json b/deeppavlov/configs/embedder/elmo_ru_wiki.json similarity index 100% rename from deeppavlov/configs/elmo_embedder/elmo_ru_wiki.json rename to deeppavlov/configs/embedder/elmo_ru_wiki.json diff --git a/deeppavlov/configs/vectorizer/tfidf_vectorizer.json b/deeppavlov/configs/embedder/tfidf_vectorizer.json similarity index 100% rename from deeppavlov/configs/vectorizer/tfidf_vectorizer.json rename to deeppavlov/configs/embedder/tfidf_vectorizer.json diff --git a/deeppavlov/configs/morpho_tagger/BERT/morpho_ru_syntagrus_bert.json b/deeppavlov/configs/morpho_tagger/BERT/morpho_ru_syntagrus_bert.json index 58c9db617c..7a977724cd 100644 --- a/deeppavlov/configs/morpho_tagger/BERT/morpho_ru_syntagrus_bert.json +++ b/deeppavlov/configs/morpho_tagger/BERT/morpho_ru_syntagrus_bert.json @@ -32,12 +32,7 @@ "subword_mask_mode": "last", "token_masking_prob": 0.0, "in": ["x_words"], - "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "pred_subword_mask"] - }, - { - "class_name": "mask", - "in": ["x_subword_tokens"], - "out": ["x_subword_mask"] + "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask"] }, { "id": "tag_vocab", @@ -77,7 +72,7 @@ "clip_norm": null, "save_path": "{WORK_PATH}/model", "load_path": "{WORK_PATH}/model", - "in": ["x_subword_tok_ids", "x_subword_mask", "pred_subword_mask"], + "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], "in_y": ["y_ind"], "out": ["y_predicted_ind"] }, diff --git a/deeppavlov/configs/ner/ner_conll2003_bert.json b/deeppavlov/configs/ner/ner_conll2003_bert.json index ae17c3b34a..c98af28a85 100644 --- a/deeppavlov/configs/ner/ner_conll2003_bert.json +++ b/deeppavlov/configs/ner/ner_conll2003_bert.json @@ -20,12 +20,7 @@ "max_subword_length": 15, "token_masking_prob": 0.0, "in": ["x"], - "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "pred_subword_mask"] - }, - { - "class_name": "mask", - "in": ["x_subword_tokens"], - "out": ["x_subword_mask"] + "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask"] }, { "id": "tag_vocab", @@ -59,7 +54,7 @@ "clip_norm": 1.0, "save_path": "{NER_PATH}/model", "load_path": "{NER_PATH}/model", - "in": ["x_subword_tok_ids", "x_subword_mask", "pred_subword_mask"], + "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], "in_y": ["y_ind"], "out": ["y_pred_ind"] }, diff --git a/deeppavlov/configs/ner/ner_ontonotes.json b/deeppavlov/configs/ner/ner_ontonotes.json index a0a2a3114f..df65ff66fd 100644 --- a/deeppavlov/configs/ner/ner_ontonotes.json +++ b/deeppavlov/configs/ner/ner_ontonotes.json @@ -34,8 +34,8 @@ "class_name": "simple_vocab", "pad_with_zeros": true, "fit_on": ["y"], - "save_path": "{MODELS_PATH}/ner_ontonotes/tag.dict", - "load_path": "{MODELS_PATH}/ner_ontonotes/tag.dict", + "save_path": "{MODEL_PATH}/tag.dict", + "load_path": "{MODEL_PATH}/tag.dict", "out": ["y_ind"] }, { @@ -49,8 +49,8 @@ "class_name": "simple_vocab", "pad_with_zeros": true, "fit_on": ["x_char"], - "save_path": "{MODELS_PATH}/ner_ontonotes/char.dict", - "load_path": 
"{MODELS_PATH}/ner_ontonotes/char.dict", + "save_path": "{MODEL_PATH}/char.dict", + "load_path": "{MODEL_PATH}/char.dict", "out": ["x_char_ind"] }, { @@ -95,8 +95,8 @@ "n_tags": "#tag_vocab.len", "capitalization_dim": "#capitalization.dim", "char_emb_dim": "#embeddings_char.dim", - "save_path": "{MODELS_PATH}/ner_ontonotes/model", - "load_path": "{MODELS_PATH}/ner_ontonotes/model", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", "char_emb_mat": "#embeddings_char.emb_mat", "two_dense_on_top": true, "use_crf": true, @@ -136,7 +136,7 @@ "val_every_n_epochs": 1, "log_every_n_batches": -1, - "tensorboard_log_dir": "{MODELS_PATH}/ner_ontonotes/logs", + "tensorboard_log_dir": "{MODEL_PATH}/logs", "show_examples": false, "class_name": "nn_trainer", "evaluation_targets": [ @@ -148,7 +148,8 @@ "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/ner_ontonotes" }, "requirements": [ "{DEEPPAVLOV_PATH}/requirements/gensim.txt", diff --git a/deeppavlov/configs/ner/ner_ontonotes_bert.json b/deeppavlov/configs/ner/ner_ontonotes_bert.json index 1dd1f1c515..a0a9595f16 100644 --- a/deeppavlov/configs/ner/ner_ontonotes_bert.json +++ b/deeppavlov/configs/ner/ner_ontonotes_bert.json @@ -20,12 +20,7 @@ "max_subword_length": 15, "token_masking_prob": 0.0, "in": ["x"], - "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "pred_subword_mask"] - }, - { - "class_name": "mask", - "in": ["x_subword_tokens"], - "out": ["x_subword_mask"] + "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask"] }, { "id": "tag_vocab", @@ -59,7 +54,7 @@ "clip_norm": 1.0, "save_path": "{NER_PATH}/model", "load_path": "{NER_PATH}/model", - "in": ["x_subword_tok_ids", "x_subword_mask", "pred_subword_mask"], + "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], "in_y": ["y_ind"], "out": ["y_pred_ind"] }, diff --git a/deeppavlov/configs/ner/ner_ontonotes_bert_emb.json b/deeppavlov/configs/ner/ner_ontonotes_bert_emb.json new file mode 100644 index 0000000000..2319436e6c --- /dev/null +++ b/deeppavlov/configs/ner/ner_ontonotes_bert_emb.json @@ -0,0 +1,126 @@ +{ + "dataset_reader": { + "class_name": "conll2003_reader", + "data_path": "{DOWNLOADS_PATH}/ontonotes", + "dataset_name": "ontonotes" + }, + "dataset_iterator": { + "class_name": "data_learning_iterator", + "seed": 42 + }, + "chainer": { + "in": ["x"], + "in_y": ["y"], + "pipe": [ + { + "class_name": "transformers_bert_preprocessor", + "vocab_file": "{BERT_PATH}/vocab.txt", + "do_lower_case": false, + "max_seq_length": 512, + "in": ["x"], + "out": ["x_tokens", "subword_tokens", "subword_tok_ids", "startofword_markers", "attention_mask"] + }, + { + "in": ["y"], + "id": "tag_vocab", + "class_name": "simple_vocab", + "pad_with_zeros": true, + "fit_on": ["y"], + "save_path": "{MODEL_PATH}/tag.dict", + "load_path": "{MODEL_PATH}/tag.dict", + "out": ["y_ind"] + }, + { + "in": ["x_tokens"], + "class_name": "mask", + "out": ["mask"] + }, + { + "class_name": "transformers_bert_embedder", + "id": "embedder", + "bert_config_path": "{BERT_PATH}/bert_config.json", + "truncate": false, + "load_path": "{BERT_PATH}", + "in": ["subword_tok_ids", "startofword_markers", "attention_mask"], + "out": ["x_emb", "subword_emb", "max_emb", "mean_emb", "pooler_output"] + }, + { + "in": ["x_emb", "mask"], + "in_y": ["y_ind"], + "out": ["y_predicted"], + "class_name": "ner", + 
"main": true, + "token_emb_dim": "#embedder.dim", + "n_hidden_list": [256, 256, 256], + "net_type": "rnn", + "cell_type": "lstm", + "use_cudnn_rnn": true, + "n_tags": "#tag_vocab.len", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "two_dense_on_top": true, + "use_crf": true, + "use_batch_norm": true, + "embeddings_dropout": true, + "top_dropout": true, + "intra_layer_dropout": false, + "l2_reg": 0, + "learning_rate": 3e-3, + "learning_rate_drop_patience": 3, + "dropout_keep_prob": 0.7 + }, + { + "ref": "tag_vocab", + "in": ["y_predicted"], + "out": ["tags"] + } + ], + + "out": ["x_tokens", "tags"] + }, + "train": { + "epochs": 100, + "batch_size": 64, + + "metrics": [ + { + "name": "ner_f1", + "inputs": ["y", "tags"] + }, + { + "name": "ner_token_f1", + "inputs": ["y", "tags"] + } + ], + "validation_patience": 7, + "val_every_n_epochs": 1, + + "log_every_n_batches": -1, + "tensorboard_log_dir": "{MODEL_PATH}/logs", + "show_examples": false, + "class_name": "nn_trainer", + "evaluation_targets": [ + "valid", + "test" + ] + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/ner_ontonotes_bert_emb", + "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/multi_cased_L-12_H-768_A-12_pt" + }, + "requirements": [ + "{DEEPPAVLOV_PATH}/requirements/gensim.txt", + "{DEEPPAVLOV_PATH}/requirements/tf.txt" + ], + "download": [ + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12_pt.tar.gz", + "subdir": "{DOWNLOADS_PATH}/bert_models" + } + ] + } +} diff --git a/deeppavlov/configs/ner/ner_ontonotes_bert_mult.json b/deeppavlov/configs/ner/ner_ontonotes_bert_mult.json index bda3dd30df..a596a7599d 100644 --- a/deeppavlov/configs/ner/ner_ontonotes_bert_mult.json +++ b/deeppavlov/configs/ner/ner_ontonotes_bert_mult.json @@ -20,12 +20,7 @@ "max_subword_length": 15, "token_masking_prob": 0.0, "in": ["x"], - "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "pred_subword_mask"] - }, - { - "class_name": "mask", - "in": ["x_subword_tokens"], - "out": ["x_subword_mask"] + "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask"] }, { "id": "tag_vocab", @@ -59,7 +54,7 @@ "clip_norm": 1.0, "save_path": "{NER_PATH}/model", "load_path": "{NER_PATH}/model", - "in": ["x_subword_tok_ids", "x_subword_mask", "pred_subword_mask"], + "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], "in_y": ["y_ind"], "out": ["y_pred_ind"] }, diff --git a/deeppavlov/configs/ner/ner_rus_bert.json b/deeppavlov/configs/ner/ner_rus_bert.json index 0b8c7e9848..620d42d7aa 100644 --- a/deeppavlov/configs/ner/ner_rus_bert.json +++ b/deeppavlov/configs/ner/ner_rus_bert.json @@ -20,12 +20,7 @@ "max_subword_length": 15, "token_masking_prob": 0.0, "in": ["x"], - "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "pred_subword_mask"] - }, - { - "class_name": "mask", - "in": ["x_subword_tokens"], - "out": ["x_subword_mask"] + "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask"] }, { "id": "tag_vocab", @@ -59,7 +54,7 @@ "clip_norm": null, "save_path": "{NER_PATH}/model", "load_path": "{NER_PATH}/model", - "in": ["x_subword_tok_ids", "x_subword_mask", "pred_subword_mask"], + "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], "in_y": ["y_ind"], "out": ["y_pred_ind"] }, diff --git 
a/deeppavlov/configs/syntax/syntax_ru_syntagrus_bert.json b/deeppavlov/configs/syntax/syntax_ru_syntagrus_bert.json index 6ed421b605..4e7c018f98 100644 --- a/deeppavlov/configs/syntax/syntax_ru_syntagrus_bert.json +++ b/deeppavlov/configs/syntax/syntax_ru_syntagrus_bert.json @@ -33,12 +33,7 @@ "subword_mask_mode": "last", "token_masking_prob": 0.0, "in": ["x_words"], - "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "pred_subword_mask"] - }, - { - "class_name": "mask", - "in": ["x_subword_tokens"], - "out": ["x_subword_mask"] + "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask"] }, { "id": "dep_vocab", @@ -78,7 +73,7 @@ "clip_norm": null, "save_path": "{WORK_PATH}/model_joint", "load_path": "{WORK_PATH}/model_joint", - "in": ["x_subword_tok_ids", "x_subword_mask", "pred_subword_mask"], + "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], "in_y": ["y_heads", "y_deps_indexes"], "out": ["y_predicted_heads_probs", "y_predicted_deps_indexes"] }, diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index 92e97c955f..75374af3d5 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -8,8 +8,6 @@ "basic_classification_reader": "deeppavlov.dataset_readers.basic_classification_reader:BasicClassificationDatasetReader", "bert_as_summarizer": "deeppavlov.models.bert.bert_as_summarizer:BertAsSummarizer", "bert_classifier": "deeppavlov.models.bert.bert_classifier:BertClassifierModel", - "bert_sequence_tagger": "deeppavlov.models.bert.bert_sequence_tagger:BertSequenceTagger", - "bert_syntax_parser": "deeppavlov.models.syntax_parser.network:BertSyntaxParser", "bert_ner_preprocessor": "deeppavlov.models.preprocessors.bert_preprocessor:BertNerPreprocessor", "bert_preprocessor": "deeppavlov.models.preprocessors.bert_preprocessor:BertPreprocessor", "bert_ranker": "deeppavlov.models.bert.bert_ranker:BertRankerModel", @@ -18,6 +16,9 @@ "bert_sep_ranker_predictor": "deeppavlov.models.bert.bert_ranker:BertSepRankerPredictor", "bert_sep_ranker_predictor_preprocessor": "deeppavlov.models.preprocessors.bert_preprocessor:BertSepRankerPredictorPreprocessor", "bert_sep_ranker_preprocessor": "deeppavlov.models.preprocessors.bert_preprocessor:BertSepRankerPreprocessor", + "bert_sequence_network": "deeppavlov.models.bert.bert_sequence_tagger:BertSequenceNetwork", + "bert_sequence_tagger": "deeppavlov.models.bert.bert_sequence_tagger:BertSequenceTagger", + "bert_syntax_parser": "deeppavlov.models.syntax_parser.network:BertSyntaxParser", "bilstm_gru_nn": "deeppavlov.models.ranking.bilstm_gru_siamese_network:BiLSTMGRUSiameseNetwork", "bilstm_nn": "deeppavlov.models.ranking.bilstm_siamese_network:BiLSTMSiameseNetwork", "bow": "deeppavlov.models.embedders.bow_embedder:BoWEmbedder", @@ -31,6 +32,7 @@ "dam_nn_use_transformer": "deeppavlov.models.ranking.deep_attention_matching_network_use_transformer:DAMNetworkUSETransformer", "data_fitting_iterator": "deeppavlov.core.data.data_fitting_iterator:DataFittingIterator", "data_learning_iterator": "deeppavlov.core.data.data_learning_iterator:DataLearningIterator", + "dependency_output_prettifier": "deeppavlov.models.morpho_tagger.common:DependencyOutputPrettifier", "dialog_db_result_iterator": "deeppavlov.dataset_iterators.dialog_iterator:DialogDBResultDatasetIterator", "dialog_iterator": "deeppavlov.dataset_iterators.dialog_iterator:DialogDatasetIterator", "dialog_state": 
"deeppavlov.models.seq2seq_go_bot.dialog_state:DialogState", @@ -69,7 +71,6 @@ "kvret_dialog_iterator": "deeppavlov.dataset_iterators.kvret_dialog_iterator:KvretDialogDatasetIterator", "kvret_reader": "deeppavlov.dataset_readers.kvret_reader:KvretDatasetReader", "lazy_tokenizer": "deeppavlov.models.tokenizers.lazy_tokenizer:LazyTokenizer", - "dependency_output_prettifier": "deeppavlov.models.morpho_tagger.common:DependencyOutputPrettifier", "lemmatized_output_prettifier": "deeppavlov.models.morpho_tagger.common:LemmatizedOutputPrettifier", "line_reader": "deeppavlov.dataset_readers.line_reader:LineReader", "logit_ranker": "deeppavlov.models.doc_retrieval.logit_ranker:LogitRanker", @@ -150,6 +151,8 @@ "tfidf_ranker": "deeppavlov.models.doc_retrieval.tfidf_ranker:TfidfRanker", "tfidf_weighted": "deeppavlov.models.embedders.tfidf_weighted_embedder:TfidfWeightedEmbedder", "top1_elector": "deeppavlov.models.spelling_correction.electors.top1_elector:TopOneElector", + "transformers_bert_embedder": "deeppavlov.models.embedders.transformers_embedder:TransformersBertEmbedder", + "transformers_bert_preprocessor": "deeppavlov.models.preprocessors.transformers_preprocessor:TransformersBertPreprocessor", "typos_custom_reader": "deeppavlov.dataset_readers.typos_reader:TyposCustom", "typos_iterator": "deeppavlov.dataset_iterators.typos_iterator:TyposDatasetIterator", "typos_kartaslov_reader": "deeppavlov.dataset_readers.typos_reader:TyposKartaslov", diff --git a/deeppavlov/models/classifiers/keras_classification_model.py b/deeppavlov/models/classifiers/keras_classification_model.py index 1bffd9c9ac..fe4ced95c3 100644 --- a/deeppavlov/models/classifiers/keras_classification_model.py +++ b/deeppavlov/models/classifiers/keras_classification_model.py @@ -120,7 +120,7 @@ def __init__(self, embedding_size: int, n_classes: int, self.n_classes = self.opt.get('n_classes') if self.n_classes == 0: - raise ConfigError("Please, provide vocabulary with considered intents.") + raise ConfigError("Please, provide vocabulary with considered classes.") self.load() diff --git a/deeppavlov/models/embedders/transformers_embedder.py b/deeppavlov/models/embedders/transformers_embedder.py new file mode 100644 index 0000000000..6a1fd40d6c --- /dev/null +++ b/deeppavlov/models/embedders/transformers_embedder.py @@ -0,0 +1,94 @@ +# Copyright 2020 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from pathlib import Path +from typing import Union, Tuple, Collection + +import torch +import transformers + +from deeppavlov.core.commands.utils import expand_path +from deeppavlov.core.common.registry import register +from deeppavlov.core.models.serializable import Serializable + + +@register('transformers_bert_embedder') +class TransformersBertEmbedder(Serializable): + """Transformers-based BERT model for embeddings tokens, subtokens and sentences + + Args: + load_path: path to a pretrained BERT pytorch checkpoint + bert_config_file: path to a BERT configuration file + truncate: whether to remove zero-paddings from returned data + + """ + model: transformers.BertModel + dim: int + + def __init__(self, load_path: Union[str, Path], bert_config_path: Union[str, Path] = None, + truncate: bool = False, **kwargs): + super().__init__(save_path=None, load_path=load_path, **kwargs) + if bert_config_path is not None: + bert_config_path = expand_path(bert_config_path) + self.config = bert_config_path + self.truncate = truncate + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.load() + + def save(self, *args, **kwargs): + raise NotImplementedError + + def load(self): + self.model = transformers.BertModel.from_pretrained(self.load_path, config=self.config).eval().to(self.device) + self.dim = self.model.config.hidden_size + + def __call__(self, subtoken_ids_batch: Collection[Collection[int]], startofwords_batch: Collection[Collection[int]], + attention_batch: Collection[Collection[int]]) -> Tuple[Collection[Collection[Collection[float]]], + Collection[Collection[Collection[float]]], + Collection[Collection[float]], + Collection[Collection[float]], + Collection[Collection[float]]]: + """Predict embeddings values for a given batch + + Args: + subtoken_ids_batch: padded indexes for every subtoken + startofwords_batch: a mask matrix with ``1`` for every first subtoken init in a token and ``0`` + for every other subtoken + attention_batch: a mask matrix with ``1`` for every significant subtoken and ``0`` for paddings + """ + ids_tensor = torch.tensor(subtoken_ids_batch, device=self.device) + startofwords_tensor = torch.tensor(startofwords_batch, device=self.device).bool() + attention_tensor = torch.tensor(attention_batch, device=self.device) + with torch.no_grad(): + last_hidden, pooler_output = self.model(ids_tensor, attention_tensor) + attention_tensor = attention_tensor.unsqueeze(-1) + max_emb = torch.max(last_hidden - 1e9 * (1 - attention_tensor), dim=1)[0] + subword_emb = last_hidden * attention_tensor + mean_emb = torch.sum(subword_emb, dim=1) / torch.sum(attention_tensor, dim=1) + + tokens_lengths = startofwords_tensor.sum(dim=1) + word_emb = torch.zeros((subword_emb.shape[0], tokens_lengths.max(), subword_emb.shape[2]), + device=self.device, dtype=subword_emb.dtype) + target_indexes = (torch.arange(word_emb.shape[1], device=self.device).expand(word_emb.shape[:-1]) < + tokens_lengths.unsqueeze(-1)) + word_emb[target_indexes] = subword_emb[startofwords_tensor] + + subword_emb = subword_emb.cpu().numpy() + word_emb = word_emb.cpu().numpy() + pooler_output = pooler_output.cpu().numpy() + max_emb = max_emb.cpu().numpy() + mean_emb = mean_emb.cpu().numpy() + if self.truncate: + subword_emb = [item[:mask.sum()] for item, mask in zip(subword_emb, attention_batch)] + word_emb = [item[:mask.sum()] for item, mask in zip(word_emb, startofwords_batch)] + return word_emb, subword_emb, max_emb, mean_emb, pooler_output diff --git 
a/deeppavlov/models/preprocessors/bert_preprocessor.py b/deeppavlov/models/preprocessors/bert_preprocessor.py index ed6241fd7f..e60a068193 100644 --- a/deeppavlov/models/preprocessors/bert_preprocessor.py +++ b/deeppavlov/models/preprocessors/bert_preprocessor.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import re +import random from logging import getLogger from typing import Tuple, List, Optional, Union -import numpy as np from bert_dp.preprocessing import convert_examples_to_features, InputExample, InputFeatures from bert_dp.tokenization import FullTokenizer @@ -23,6 +23,7 @@ from deeppavlov.core.common.registry import register from deeppavlov.core.data.utils import zero_pad from deeppavlov.core.models.component import Component +from deeppavlov.models.preprocessors.mask import Mask log = getLogger(__name__) @@ -125,88 +126,91 @@ def __call__(self, **kwargs): if isinstance(tokens[0], str): tokens = [re.findall(self._re_tokenizer, s) for s in tokens] - subword_tokens, subword_tok_ids, subword_masks, subword_tags = [], [], [], [] + subword_tokens, subword_tok_ids, startofword_markers, subword_tags = [], [], [], [] for i in range(len(tokens)): toks = tokens[i] ys = ['O'] * len(toks) if tags is None else tags[i] - mask = [int(y != 'X') for y in ys] - assert len(toks) == len(ys) == len(mask), \ - f"toks({len(toks)}) should have the same length as " \ - f" ys({len(ys)}) and mask({len(mask)}), tokens = {toks}." - sw_toks, sw_mask, sw_ys = self._ner_bert_tokenize(toks, - mask, - ys, - self.tokenizer, - self.max_subword_length, - mode=self.mode, - subword_mask_mode=self.subword_mask_mode, - token_masking_prob=self.token_masking_prob) + assert len(toks) == len(ys), \ + f"toks({len(toks)}) should have the same length as ys({len(ys)})" + sw_toks, sw_marker, sw_ys = \ + self._ner_bert_tokenize(toks, + ys, + self.tokenizer, + self.max_subword_length, + mode=self.mode, + subword_mask_mode=self.subword_mask_mode, + token_masking_prob=self.token_masking_prob) if self.max_seq_length is not None: if len(sw_toks) > self.max_seq_length: raise RuntimeError(f"input sequence after bert tokenization" f" shouldn't exceed {self.max_seq_length} tokens.") subword_tokens.append(sw_toks) subword_tok_ids.append(self.tokenizer.convert_tokens_to_ids(sw_toks)) - subword_masks.append(sw_mask) + startofword_markers.append(sw_marker) subword_tags.append(sw_ys) - assert len(sw_mask) == len(sw_toks) == len(subword_tok_ids[-1]) == len(sw_ys), \ - f"length of mask({len(sw_mask)}), tokens({len(sw_toks)})," \ + assert len(sw_marker) == len(sw_toks) == len(subword_tok_ids[-1]) == len(sw_ys), \ + f"length of sow_marker({len(sw_marker)}), tokens({len(sw_toks)})," \ f" token ids({len(subword_tok_ids[-1])}) and ys({len(ys)})" \ f" for tokens = `{toks}` should match" subword_tok_ids = zero_pad(subword_tok_ids, dtype=int, padding=0) - subword_masks = zero_pad(subword_masks, dtype=int, padding=0) + startofword_markers = zero_pad(startofword_markers, dtype=int, padding=0) + attention_mask = Mask()(subword_tokens) + if tags is not None: if self.provide_subword_tags: - return tokens, subword_tokens, subword_tok_ids, subword_masks, subword_tags + return tokens, subword_tokens, subword_tok_ids, \ + attention_mask, startofword_markers, subword_tags else: nonmasked_tags = [[t for t in ts if t != 'X'] for ts in tags] for swts, swids, swms, ts in zip(subword_tokens, subword_tok_ids, - subword_masks, + startofword_markers, nonmasked_tags): if (len(swids) != len(swms)) 
or (len(ts) != sum(swms)): log.warning('Not matching lengths of the tokenization!') log.warning(f'Tokens len: {len(swts)}\n Tokens: {swts}') - log.warning(f'Masks len: {len(swms)}, sum: {sum(swms)}') + log.warning(f'Markers len: {len(swms)}, sum: {sum(swms)}') log.warning(f'Masks: {swms}') log.warning(f'Tags len: {len(ts)}\n Tags: {ts}') - return tokens, subword_tokens, subword_tok_ids, subword_masks, nonmasked_tags - return tokens, subword_tokens, subword_tok_ids, subword_masks + return tokens, subword_tokens, subword_tok_ids, \ + attention_mask, startofword_markers, nonmasked_tags + return tokens, subword_tokens, subword_tok_ids, startofword_markers, attention_mask @staticmethod def _ner_bert_tokenize(tokens: List[str], - mask: List[int], tags: List[str], tokenizer: FullTokenizer, max_subword_len: int = None, mode: str = None, subword_mask_mode: str = "first", - token_masking_prob: float = 0.0) -> Tuple[List[str], List[int], List[str]]: + token_masking_prob: float = None) -> Tuple[List[str], List[int], List[str]]: + do_masking = (mode == 'train') and (token_masking_prob is not None) + do_cutting = (max_subword_len is not None) tokens_subword = ['[CLS]'] - mask_subword = [0] + startofword_markers = [0] tags_subword = ['X'] - for token, flag, tag in zip(tokens, mask, tags): + for token, tag in zip(tokens, tags): + token_marker = int(tag != 'X') subwords = tokenizer.tokenize(token) - if not subwords or \ - ((max_subword_len is not None) and (len(subwords) > max_subword_len)): + if not subwords or (do_cutting and (len(subwords) > max_subword_len)): tokens_subword.append('[UNK]') - mask_subword.append(flag) + startofword_markers.append(token_marker) tags_subword.append(tag) else: - if mode == 'train' and token_masking_prob > 0.0 and np.random.rand() < token_masking_prob: + if do_masking and (random.random() < token_masking_prob): tokens_subword.extend(['[MASK]'] * len(subwords)) else: tokens_subword.extend(subwords) if subword_mask_mode == "last": - mask_subword.extend([0] * (len(subwords) - 1) + [flag]) + startofword_markers.extend([0] * (len(subwords) - 1) + [token_marker]) else: - mask_subword.extend([flag] + [0] * (len(subwords) - 1)) + startofword_markers.extend([token_marker] + [0] * (len(subwords) - 1)) tags_subword.extend([tag] + ['X'] * (len(subwords) - 1)) tokens_subword.append('[SEP]') - mask_subword.append(0) + startofword_markers.append(0) tags_subword.append('X') - return tokens_subword, mask_subword, tags_subword + return tokens_subword, startofword_markers, tags_subword @register('bert_ranker_preprocessor') diff --git a/deeppavlov/models/preprocessors/transformers_preprocessor.py b/deeppavlov/models/preprocessors/transformers_preprocessor.py new file mode 100644 index 0000000000..9bbfabea9e --- /dev/null +++ b/deeppavlov/models/preprocessors/transformers_preprocessor.py @@ -0,0 +1,76 @@ +# Copyright 2020 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
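The preprocessor below replaces the old subword mask with explicit start-of-word markers plus a separate attention mask: markers are ``1`` only at the first subtoken of each original word and ``0`` at ``[CLS]``, ``[SEP]`` and word continuations, which lets the embedder gather one vector per word. A simplified toy sketch mirroring that construction (the subword split is hypothetical, real splits depend on the vocabulary):

.. code:: python

    def markers_for(wordpieces_per_token):
        """Build [CLS] ... [SEP] subtokens and start-of-word markers for pre-split wordpieces."""
        subtokens, markers = ['[CLS]'], [0]
        for pieces in wordpieces_per_token:
            for i, piece in enumerate(pieces):
                subtokens.append(piece)
                markers.append(int(i == 0))
        subtokens.append('[SEP]')
        markers.append(0)
        return subtokens, markers

    subtokens, markers = markers_for([['York', '##shire'], ['is'], ['nice']])
    assert subtokens == ['[CLS]', 'York', '##shire', 'is', 'nice', '[SEP]']
    assert markers == [0, 1, 0, 1, 1, 0]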
+from logging import getLogger +from typing import List, Union, Tuple + +import numpy as np +from transformers import BertTokenizer + +from deeppavlov.core.commands.utils import expand_path +from deeppavlov.core.common.registry import register +from deeppavlov.core.models.component import Component + +log = getLogger(__name__) + + +def _pad(data: List[List[Union[int, float]]], value: Union[int, float] = 0): + max_len = max(map(len, data)) + res = np.ones([len(data), max_len], dtype=type(value)) * value + for i, item in enumerate(data): + res[i][:len(item)] = item + return res + + +@register('transformers_bert_preprocessor') +class TransformersBertPreprocessor(Component): + def __init__(self, vocab_file: str, + do_lower_case: bool = False, + max_seq_length: int = 512, + tokenize_chinese_chars: bool = True, + **kwargs): + vocab_file = expand_path(vocab_file) + self.tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case, + tokenize_chinese_chars=tokenize_chinese_chars) + self.max_seq_length = max_seq_length + + def __call__(self, tokens_batch: Union[List[str], List[List[str]]]) ->\ + Tuple[List[List[str]], List[List[str]], np.ndarray, np.ndarray, np.ndarray]: + + if isinstance(tokens_batch[0], str): # skip for already tokenized text + tokens_batch = [self.tokenizer.basic_tokenizer.tokenize(sentence, self.tokenizer.all_special_tokens) + for sentence in tokens_batch] + startofword_markers_batch = [] + subtokens_batch = [] + for tokens in tokens_batch: + startofword_markers = [0] + subtokens = ['[CLS]'] + for token in tokens: + for i, subtoken in enumerate(self.tokenizer.wordpiece_tokenizer.tokenize(token)): + startofword_markers.append(int(i == 0)) + subtokens.append(subtoken) + startofword_markers.append(0) + subtokens.append('[SEP]') + if len(subtokens) > self.max_seq_length: + raise RuntimeError(f"input sequence after bert tokenization" + f" cannot exceed {self.max_seq_length} tokens.") + + startofword_markers_batch.append(startofword_markers) + subtokens_batch.append(subtokens) + + encoded = self.tokenizer.batch_encode_plus([[subtokens, None] for subtokens in subtokens_batch], + add_special_tokens=False) + + return (tokens_batch, subtokens_batch, + _pad(encoded['input_ids'], value=self.tokenizer.pad_token_id), + _pad(startofword_markers_batch), _pad(encoded['attention_mask'])) diff --git a/deeppavlov/requirements/pytorch.txt b/deeppavlov/requirements/pytorch.txt new file mode 100644 index 0000000000..7657e148b1 --- /dev/null +++ b/deeppavlov/requirements/pytorch.txt @@ -0,0 +1 @@ +torch==1.4.0 \ No newline at end of file diff --git a/deeppavlov/requirements/tf-gpu.txt b/deeppavlov/requirements/tf-gpu.txt index 3a4ce9cac1..d97315f1cc 100644 --- a/deeppavlov/requirements/tf-gpu.txt +++ b/deeppavlov/requirements/tf-gpu.txt @@ -1 +1 @@ -tensorflow-gpu==1.15.0 \ No newline at end of file +tensorflow-gpu==1.15.2 \ No newline at end of file diff --git a/deeppavlov/requirements/tf.txt b/deeppavlov/requirements/tf.txt index 504887d126..26ff9379f8 100644 --- a/deeppavlov/requirements/tf.txt +++ b/deeppavlov/requirements/tf.txt @@ -1 +1 @@ -tensorflow==1.15.0 \ No newline at end of file +tensorflow==1.15.2 \ No newline at end of file diff --git a/deeppavlov/requirements/transformers.txt b/deeppavlov/requirements/transformers.txt new file mode 100644 index 0000000000..0969a5a702 --- /dev/null +++ b/deeppavlov/requirements/transformers.txt @@ -0,0 +1 @@ +transformers==2.5.0 \ No newline at end of file diff --git a/docs/apiref/models/embedders.rst b/docs/apiref/models/embedders.rst 
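
For reference, the new preprocessor can also be instantiated directly rather than through a config; a minimal usage sketch under that assumption (the vocabulary path is hypothetical, and the inline comments describe the five outputs of ``__call__`` as defined above):

.. code:: python

    from deeppavlov.models.preprocessors.transformers_preprocessor import TransformersBertPreprocessor

    # Hypothetical location of a WordPiece vocab.txt from any cased BERT checkpoint.
    preprocessor = TransformersBertPreprocessor(
        vocab_file='~/.deeppavlov/downloads/bert_models/cased_L-12_H-768_A-12/vocab.txt',
        do_lower_case=False,
        max_seq_length=512)

    tokens, subtokens, ids, startofword, attention = preprocessor(['DeepPavlov wraps BERT models.'])

    # tokens       -> basic-tokenized words, one list per input string
    # subtokens    -> WordPiece pieces wrapped in [CLS] ... [SEP]
    # ids          -> int matrix of subtoken ids, padded with the tokenizer's pad_token_id
    # startofword  -> 1 for the first subtoken of every word, 0 for continuations and [CLS]/[SEP]
    # attention    -> 1 for real subtokens, 0 for padding positions

Already tokenized input (a list of word lists) is accepted as well, in which case the basic tokenization step is skipped.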
index 329de7d71f..b004dfa006 100644 --- a/docs/apiref/models/embedders.rst +++ b/docs/apiref/models/embedders.rst @@ -20,4 +20,8 @@ deeppavlov.models.embedders .. autoclass:: deeppavlov.models.embedders.tfidf_weighted_embedder.TfidfWeightedEmbedder + .. automethod:: __call__ + +.. autoclass:: deeppavlov.models.embedders.transformers_embedder.TransformersBertEmbedder + .. automethod:: __call__ \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 5b861fe5b5..f867223fa3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -190,8 +190,9 @@ # -- Extension configuration ------------------------------------------------- -autodoc_mock_imports = ['tensorflow', 'tensorflow_hub', 'fastText', 'nltk', 'gensim', 'kenlm', 'spacy', 'lxml', - 'sortedcontainers', 'russian_tagsets', 'bert_dp', 'aiml', 'rasa', 'fasttext', 'sacremoses'] +autodoc_mock_imports = ['tensorflow', 'tensorflow_hub', 'fastText', 'nltk', 'gensim', 'kenlm', 'spacy', 'lxml', 'torch', + 'sortedcontainers', 'russian_tagsets', 'bert_dp', 'aiml', 'rasa', 'fasttext', 'sacremoses', + 'transformers'] extlinks = { 'config': (f'https://github.com/deepmipt/DeepPavlov/blob/{release}/deeppavlov/configs/%s', None) diff --git a/docs/features/models/bert.rst b/docs/features/models/bert.rst index e3e8c7b57f..3e27482b49 100644 --- a/docs/features/models/bert.rst +++ b/docs/features/models/bert.rst @@ -7,37 +7,86 @@ English. | BERT paper: https://arxiv.org/abs/1810.04805 | Google Research BERT repository: https://github.com/google-research/bert -There are several pre-trained BERT models released by Google Research, more detail about these pretrained models could be found here https://github.com/google-research/bert#pre-trained-models: +There are several pre-trained BERT models released by Google Research, more detail about these pretrained models could be found here: https://github.com/google-research/bert#pre-trained-models -- BERT-base, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] `__, `[deeppavlov] `__ -- BERT-base, English, uncased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] `__, `[deeppavlov] `__ +- BERT-base, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] `__, + `[deeppavlov] `__ +- BERT-base, English, uncased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] `__, + `[deeppavlov] `__ - BERT-large, English, cased, 24-layer, 1024-hidden, 16-heads, 340M parameters: download from `[google] `__ -- BERT-base, multilingual, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: download from `[google] `__, `[deeppavlov] `__ -- BERT-base, Chinese, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] `__ +- BERT-base, multilingual, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: download from `[google] `__, + `[deeppavlov] `__, `[deeppavlov_pytorch] `__ +- BERT-base, Chinese, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] `__, + `[deeppavlov] `__, `[deeppavlov_pytorch] `__ We have trained BERT-base model for other languages and domains: -- RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__ -- SlavicBERT, Slavic (bg, cs, pl, ru), cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__ -- Conversational BERT, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: `[deeppavlov] `__ -- Conversational RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: 
`[deeppavlov] `__ +- RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__, + `[deeppavlov_pytorch] `__ +- SlavicBERT, Slavic (bg, cs, pl, ru), cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__, + `[deeppavlov_pytorch] `__ +- Conversational BERT, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: `[deeppavlov] `__, + `[deeppavlov_pytorch] `__ +- Conversational RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__, + `[deeppavlov_pytorch] `__ +- Sentence Multilingual BERT, 101 languages, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__, + `[deeppavlov_pytorch] `__ +- Sentence RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__, + `[deeppavlov_pytorch] `__ + +The ``deeppavlov_pytorch`` models are designed to be run with the `HuggingFace's Transformers `__ library. RuBERT was trained on the Russian part of Wikipedia and news data. We used this training data to build vocabulary of Russian subtokens and took multilingual version of BERT-base as initialization for RuBERT [1]_. SlavicBERT was trained on Russian News and four Wikipedias: Bulgarian, Czech, Polish, and Russian. Subtoken vocabulary was built using this data. Multilingual BERT was used as an initialization for SlavicBERT. +The model is described in our ACL paper [2]_. -Conversational BERT was trained on the English part of Twitter, Reddit, DailyDialogues [3]_, OpenSubtitles [4]_, Debates [5]_, Blogs [6]_, Facebook News Comments. +Conversational BERT was trained on the English part of Twitter, Reddit, DailyDialogues [4]_, OpenSubtitles [5]_, Debates [6]_, Blogs [7]_, Facebook News Comments. We used this training data to build the vocabulary of English subtokens and took English cased version of BERT-base as initialization for English Conversational BERT. -Conversational RuBERT was trained on OpenSubtitles [4]_, Dirty, Pikabu, and Social Media segment of Taiga corpus [7]_. +Conversational RuBERT was trained on OpenSubtitles [5]_, Dirty, Pikabu, and Social Media segment of Taiga corpus [8]_. We assembled new vocabulary for Conversational RuBERT model on this data and initialized model with RuBERT. +Sentence Multilingual BERT is a representation-based sentence encoder for 101 languages of Multilingual BERT. +It is initialized with Multilingual BERT and then fine-tuned on English MultiNLI [9]_ and on the dev set of multilingual XNLI [10]_. +Sentence representations are mean pooled token embeddings in the same manner as in Sentence-BERT [12]_. + +Sentence RuBERT is a representation-based sentence encoder for Russian. +It is initialized with RuBERT and fine-tuned on SNLI [11]_ google-translated to Russian and on the Russian part of the XNLI dev set [10]_. +Sentence representations are mean pooled token embeddings in the same manner as in Sentence-BERT [12]_. + Here, in DeepPavlov, we made it easy to use pre-trained BERT for downstream tasks like classification, tagging, question answering and ranking. We also provide pre-trained models and examples on how to use BERT with DeepPavlov. +BERT as Embedder +---------------- + +:class:`~deeppavlov.models.embedders.transformers_embedder.TransformersBertEmbedder` allows for using BERT +model outputs as token, subtoken and sentence level embeddings. + +Additionally, the embeddings can be easily used in DeepPavlov.
To get text level, token level and subtoken level representations, +you can use or modify a :config:`BERT embedder configuration `: + +.. code:: python + + from deeppavlov.core.common.file import read_json + from deeppavlov import build_model, configs + + bert_config = read_json(configs.embedder.bert_embedder) + bert_config['metadata']['variables']['BERT_PATH'] = 'path/to/bert/directory' + + m = build_model(bert_config) + + texts = ['Hi, i want my embedding.', 'And mine too, please!'] + tokens, token_embs, subtokens, subtoken_embs, sent_max_embs, sent_mean_embs, bert_pooler_outputs = m(texts) + +Examples of using these embeddings in model training pipelines can be found in :config:`Sentiment Twitter ` +and :config:`NER Ontonotes ` configuration files. + + BERT for Classification ----------------------- @@ -96,7 +145,7 @@ transformations to predict probability that current subtoken is start/end positi BERT for Ranking ---------------- There are two main approaches in text ranking. The first one is interaction-based which is relatively accurate but -works slow and the second one is representation-based which is less accurate but faster [2]_. +works slow and the second one is representation-based which is less accurate but faster [3]_. The interaction-based ranking based on BERT is represented in the DeepPavlov with two main components :class:`~deeppavlov.models.preprocessors.bert_preprocessor.BertRankerPreprocessor` and :class:`~deeppavlov.models.bert.bert_ranker.BertRankerModel` @@ -134,9 +183,14 @@ the :doc:`config ` file must be changed to match new BERT * ``vocab_file`` in the ``bert_preprocessor`` .. [1] Kuratov, Y., Arkhipov, M. (2019). Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language. arXiv preprint arXiv:1905.07213. -.. [2] McDonald, R., Brokos, G. I., & Androutsopoulos, I. (2018). Deep relevance ranking using enhanced document-query interactions. arXiv preprint arXiv:1809.01682. -.. [3] Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset. IJCNLP 2017. -.. [4] P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles. In Proceedings of the 10th International Conference on Language Resources and Evaluation (LREC 2016) -.. [5] Justine Zhang, Ravi Kumar, Sujith Ravi, Cristian Danescu-Niculescu-Mizil. Proceedings of NAACL, 2016. -.. [6] J. Schler, M. Koppel, S. Argamon and J. Pennebaker (2006). Effects of Age and Gender on Blogging in Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for Analyzing Weblogs. -.. [7] Shavrina T., Shapovalova O. (2017) TO THE METHODOLOGY OF CORPUS CONSTRUCTION FOR MACHINE LEARNING: «TAIGA» SYNTAX TREE CORPUS AND PARSER. in proc. of “CORPORA2017”, international conference , Saint-Petersbourg, 2017. +.. [2] Arkhipov M., Trofimova M., Kuratov Y., Sorokin A. (2019). `Tuning Multilingual Transformers for Language-Specific Named Entity Recognition `__ . ACL anthology W19-3712. +.. [3] McDonald, R., Brokos, G. I., & Androutsopoulos, I. (2018). Deep relevance ranking using enhanced document-query interactions. arXiv preprint arXiv:1809.01682. +.. [4] Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset. IJCNLP 2017. +.. [5] P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles. 
In Proceedings of the 10th International Conference on Language Resources and Evaluation (LREC 2016) +.. [6] Justine Zhang, Ravi Kumar, Sujith Ravi, Cristian Danescu-Niculescu-Mizil. Proceedings of NAACL, 2016. +.. [7] J. Schler, M. Koppel, S. Argamon and J. Pennebaker (2006). Effects of Age and Gender on Blogging in Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for Analyzing Weblogs. +.. [8] Shavrina T., Shapovalova O. (2017) TO THE METHODOLOGY OF CORPUS CONSTRUCTION FOR MACHINE LEARNING: «TAIGA» SYNTAX TREE CORPUS AND PARSER. in proc. of “CORPORA2017”, international conference , Saint-Petersbourg, 2017. +.. [9] Williams A., Nangia N. & Bowman S. (2017) A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference. arXiv preprint arXiv:1704.05426 +.. [10] Williams A., Bowman S. (2018) XNLI: Evaluating Cross-lingual Sentence Representations. arXiv preprint arXiv:1809.05053 +.. [11] S. R. Bowman, G. Angeli, C. Potts, and C. D. Manning. (2015) A large annotated corpus for learning natural language inference. arXiv preprint arXiv:1508.05326 +.. [12] N. Reimers, I. Gurevych (2019) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arXiv preprint arXiv:1908.10084 diff --git a/docs/features/pretrained_vectors.rst b/docs/features/pretrained_vectors.rst index 1bb4b9477f..ee8d6d01e0 100644 --- a/docs/features/pretrained_vectors.rst +++ b/docs/features/pretrained_vectors.rst @@ -9,7 +9,9 @@ We are publishing several pre-trained BERT models: * RuBERT for Russian language * Slavic BERT for Bulgarian, Czech, Polish, and Russian * Conversational BERT for informal English -* and Conversational BERT for informal Russian +* Conversational BERT for informal Russian +* Sentence Multilingual BERT for encoding sentences in 101 languages +* Sentence RuBERT for encoding sentences in Russian Description of these models is available in the :doc:`BERT section ` of the docs. @@ -22,19 +24,32 @@ The pre-trained models are distributed under the `License Apache Downloads ~~~~~~~~~ -The models can be run with the original `BERT repo `_ code. The download links are: +The ``TensorFlow`` models can be run with the original `BERT repo `_ code +while the ``PyTorch`` models can be run with the `HuggingFace's Transformers `__ library. 
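
For the ``PyTorch`` checkpoints listed below, a quick smoke test with the Transformers library might look as follows (the directory name is hypothetical and stands for one of the unpacked ``[pytorch]`` archives; the snippet assumes ``transformers==2.5.0`` as pinned by this patch, where the forward pass returns a plain tuple):

.. code:: python

    import torch
    from transformers import BertModel, BertTokenizer

    # Hypothetical path to an unpacked [pytorch] archive (config, vocab and weights inside).
    model_dir = '/data/bert/rubert_cased_L-12_H-768_A-12_pt'

    tokenizer = BertTokenizer.from_pretrained(model_dir)
    model = BertModel.from_pretrained(model_dir)
    model.eval()

    input_ids = tokenizer.encode('Привет, мир!', return_tensors='pt')
    with torch.no_grad():
        last_hidden_state, pooler_output = model(input_ids)[:2]

    print(last_hidden_state.shape)  # (1, number_of_subtokens, 768) for these BERT-base models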
+The download links are: + ++----------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------+ +| Description | Model parameters | Download links | ++============================+=======================================+====================================================================================================================+ +| RuBERT | vocab size = 120K, parameters = 180M, | `[tensorflow] `__, | +| | size = 632MB | `[pytorch] `__ | ++----------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------+ +| Slavic BERT | vocab size = 120K, parameters = 180M, | `[tensorflow] `__, | +| | size = 632MB | `[pytorch] `__ | ++----------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------+ +| Conversational BERT | vocab size = 30K, parameters = 110M, | `[tensorflow] `__, | +| | size = 385MB | `[pytorch] `__ | ++----------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------+ +| Conversational RuBERT | vocab size = 120K, parameters = 180M, | `[tensorflow] `__, | +| | size = 630MB | `[pytorch] `__ | ++----------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------+ +| Sentence Multilingual BERT | vocab size = 120K, parameters = 180M, | `[tensorflow] `__, | +| | size = 630MB | `[pytorch] `__ | ++----------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------+ +| Sentence RuBERT | vocab size = 120K, parameters = 180M, | `[tensorflow] `__, | +| | size = 630MB | `[pytorch] `__ | ++----------------------------+---------------------------------------+--------------------------------------------------------------------------------------------------------------------+ -+------------------------+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ -| Description | Model parameters | Download link | -+========================+====================================================+==================================================================================================================================================+ -| RuBERT | vocab size = 120K, parameters = 180M, size = 632MB | `[rubert_cased_L-12_H-768_A-12] `__ | -+------------------------+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ -| Slavic BERT | vocab size = 120K, parameters = 180M, size = 632MB | `[bg_cs_pl_ru_cased_L-12_H-768_A-12] `__ | -+------------------------+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ -| Conversational BERT | vocab size = 30K, parameters = 110M, 
size = 385MB | `[conversational_cased_L-12_H-768_A-12] `__ | -+------------------------+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ -| Conversational RuBERT | vocab size = 120K, parameters = 180M, size = 630MB | `[conversational_cased_L-12_H-768_A-12] `__ | -+------------------------+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ ELMo ---- @@ -55,15 +70,15 @@ Downloads The models can be downloaded and run by configuration file or tensorflow hub module from: -+--------------------------------------------------------------------+---------------------------------------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| Description | Dataset parameters | Perplexity | Configuration file and tensorflow hub module | -+====================================================================+=============================================+==================+============================================================================================================================================================================================================================================+ -| ELMo on `Russian Wikipedia `__ | lines = 1M, tokens = 386M, size = 5GB | 43.692 | `config_file `__, `module_spec `__ | -+--------------------------------------------------------------------+---------------------------------------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ELMo on `Russian WMT News `__ | lines = 63M, tokens = 946M, size = 12GB | 49.876 | `config_file `__, `module_spec `__ | -+--------------------------------------------------------------------+---------------------------------------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ELMo on `Russian Twitter `__ | lines = 104M, tokens = 810M, size = 8.5GB | 94.145 | `config_file `__, `module_spec `__ | -+--------------------------------------------------------------------+---------------------------------------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Description | Dataset parameters | Perplexity | Configuration file and 
tensorflow hub module | ++====================================================================+=============================================+==================+=======================================================================================================================================================================================================================================+ +| ELMo on `Russian Wikipedia `__ | lines = 1M, tokens = 386M, size = 5GB | 43.692 | `config_file `__, `module_spec `__ | ++--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ELMo on `Russian WMT News `__ | lines = 63M, tokens = 946M, size = 12GB | 49.876 | `config_file `__, `module_spec `__ | ++--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ELMo on `Russian Twitter `__ | lines = 104M, tokens = 810M, size = 8.5GB | 94.145 | `config_file `__, `module_spec `__ | ++--------------------------------------------------------------------+---------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ fastText -------- diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py index 2a5d37d1a7..5f283b8d87 100644 --- a/tests/test_quick_start.py +++ b/tests/test_quick_start.py @@ -98,6 +98,7 @@ ("classifiers/intents_dstc2_big.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/insults_kaggle.json", "classifiers", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/sentiment_twitter.json", "classifiers", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/sentiment_twitter_bert_emb.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/sentiment_twitter_preproc.json", "classifiers", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/topic_ag_news.json", "classifiers", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/rusentiment_cnn.json", "classifiers", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], @@ -142,6 +143,7 @@ ("ner/ner_conll2003.json", "ner_conll2003", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_dstc2.json", "slotfill_dstc2", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_ontonotes.json", "ner_ontonotes", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], + ("ner/ner_ontonotes_bert_emb.json", "ner_ontonotes_bert_emb", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_few_shot_ru_simulate.json", "ner_fs", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_rus.json", "ner_rus", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], ("ner/slotfill_dstc2.json", "slotfill_dstc2", ('IP',)): @@ -155,7 +157,7 @@ ("kbqa/kbqa_rus.json", "kbqa", ('IP',)): [ONE_ARGUMENT_INFER_CHECK] }, "elmo_embedder": { - ("elmo_embedder/elmo_ru_news.json", "elmo_embedder_ru_news", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], + 
("embedder/elmo_ru_news.json", "embedder_ru_news", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], }, "elmo_model": { ("elmo/elmo_1b_benchmark_test.json", "elmo_1b_benchmark_test", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], From 7c4f4df2e58bf0a0470cd047723f5d41f8013516 Mon Sep 17 00:00:00 2001 From: Aleksei Lymar Date: Wed, 26 Feb 2020 12:37:28 +0300 Subject: [PATCH 15/15] docs: apply suggested fixes Co-Authored-By: Fedor Ignatov --- deeppavlov/models/embedders/transformers_embedder.py | 2 +- docs/features/models/bert.rst | 2 +- docs/features/models/squad.rst | 3 +-- docs/features/models/syntaxparser.rst | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/deeppavlov/models/embedders/transformers_embedder.py b/deeppavlov/models/embedders/transformers_embedder.py index 6a1fd40d6c..7127d5ab86 100644 --- a/deeppavlov/models/embedders/transformers_embedder.py +++ b/deeppavlov/models/embedders/transformers_embedder.py @@ -63,7 +63,7 @@ def __call__(self, subtoken_ids_batch: Collection[Collection[int]], startofwords Args: subtoken_ids_batch: padded indexes for every subtoken startofwords_batch: a mask matrix with ``1`` for every first subtoken init in a token and ``0`` - for every other subtoken + for every other subtoken attention_batch: a mask matrix with ``1`` for every significant subtoken and ``0`` for paddings """ ids_tensor = torch.tensor(subtoken_ids_batch, device=self.device) diff --git a/docs/features/models/bert.rst b/docs/features/models/bert.rst index 3e27482b49..e90af2c6fd 100644 --- a/docs/features/models/bert.rst +++ b/docs/features/models/bert.rst @@ -7,7 +7,7 @@ English. | BERT paper: https://arxiv.org/abs/1810.04805 | Google Research BERT repository: https://github.com/google-research/bert -There are several pre-trained BERT models released by Google Research, more detail about these pretrained models could be found here: https://github.com/google-research/bert#pre-trained-models +There are several pre-trained BERT models released by Google Research, more details about these pre-trained models could be found here: https://github.com/google-research/bert#pre-trained-models - BERT-base, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] `__, `[deeppavlov] `__ diff --git a/docs/features/models/squad.rst b/docs/features/models/squad.rst index 5d58394875..995ada56ac 100644 --- a/docs/features/models/squad.rst +++ b/docs/features/models/squad.rst @@ -228,8 +228,7 @@ Pretrained models are available and can be downloaded: .. code:: bash python -m deeppavlov download deeppavlov/configs/squad/squad_zh_bert.json - - python -m deeppavlov download deeppavlov/configs/squad/squad_zh_zh_bert.json + python -m deeppavlov download deeppavlov/configs/squad/squad_zh_zh_bert.json Link to DRCD dataset: http://files.deeppavlov.ai/datasets/DRCD.tar.gz Link to DRCD paper: https://arxiv.org/abs/1806.00920 diff --git a/docs/features/models/syntaxparser.rst b/docs/features/models/syntaxparser.rst index 7e3edffea0..b08ce2ffb7 100644 --- a/docs/features/models/syntaxparser.rst +++ b/docs/features/models/syntaxparser.rst @@ -167,4 +167,4 @@ and dependency head. .. _`UD Pipe Future`: https://github.com/CoNLL-UD-2018/UDPipe-Future .. _`UDify (multilingual BERT)`: https://github.com/hyperparticle/udify -So our model is by a valuable margin the state-of-the-art system for Russian syntactic parsing. +So our model is the state-of-the-art system for Russian syntactic parsing by a valuable margin.