From 21b70f0c403486029beddd49b8a22deaa5a46743 Mon Sep 17 00:00:00 2001 From: Fedor Ignatov Date: Wed, 20 Oct 2021 13:12:48 +0300 Subject: [PATCH 1/3] refactor: Remove deeppavlov.configs.elmo (#1498) * removed elmo config files * removed elmo_file_paths_iterator, elmo_model and file_paths_reader * refactor: returned file_paths_reader * docs: newlines in file_paths_reader docstring --- .../configs/elmo/elmo_1b_benchmark.json | 81 -- .../configs/elmo/elmo_1b_benchmark_test.json | 79 -- .../elmo_lm_ready4fine_tuning_ru_news.json | 83 -- ...o_lm_ready4fine_tuning_ru_news_simple.json | 83 -- .../elmo_lm_ready4fine_tuning_ru_twitter.json | 83 -- ...m_ready4fine_tuning_ru_twitter_simple.json | 83 -- .../elmo_lm_ready4fine_tuning_ru_wiki.json | 83 -- ...o_lm_ready4fine_tuning_ru_wiki_simple.json | 83 -- .../elmo/elmo_paraphraser_fine_tuning.json | 84 -- deeppavlov/core/common/registry.json | 2 - .../core/common/requirements_registry.json | 4 - .../elmo_file_paths_iterator.py | 154 ---- deeppavlov/models/elmo/__init__.py | 0 deeppavlov/models/elmo/bilm_model.py | 510 ------------ deeppavlov/models/elmo/elmo.py | 601 -------------- deeppavlov/models/elmo/elmo2tfhub.py | 208 ----- deeppavlov/models/elmo/elmo_model.py | 730 ------------------ deeppavlov/models/elmo/train_utils.py | 244 ------ docs/apiref/dataset_iterators.rst | 2 - docs/apiref/models/elmo.rst | 6 - docs/features/models/neural_ranking.rst | 10 - tests/test_quick_start.py | 3 - 22 files changed, 3216 deletions(-) delete mode 100644 deeppavlov/configs/elmo/elmo_1b_benchmark.json delete mode 100644 deeppavlov/configs/elmo/elmo_1b_benchmark_test.json delete mode 100644 deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_news.json delete mode 100644 deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_news_simple.json delete mode 100644 deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_twitter.json delete mode 100644 deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_twitter_simple.json delete mode 100644 deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_wiki.json delete mode 100644 deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_wiki_simple.json delete mode 100644 deeppavlov/configs/elmo/elmo_paraphraser_fine_tuning.json delete mode 100644 deeppavlov/dataset_iterators/elmo_file_paths_iterator.py delete mode 100644 deeppavlov/models/elmo/__init__.py delete mode 100644 deeppavlov/models/elmo/bilm_model.py delete mode 100644 deeppavlov/models/elmo/elmo.py delete mode 100644 deeppavlov/models/elmo/elmo2tfhub.py delete mode 100644 deeppavlov/models/elmo/elmo_model.py delete mode 100644 deeppavlov/models/elmo/train_utils.py delete mode 100644 docs/apiref/models/elmo.rst diff --git a/deeppavlov/configs/elmo/elmo_1b_benchmark.json b/deeppavlov/configs/elmo/elmo_1b_benchmark.json deleted file mode 100644 index 806272b771..0000000000 --- a/deeppavlov/configs/elmo/elmo_1b_benchmark.json +++ /dev/null @@ -1,81 +0,0 @@ -{ - "dataset_reader": { - "class_name": "file_paths_reader", - "data_path": "{DOWNLOADS_PATH}/elmo-1b-benchmark/data/1-billion-word-language-modeling-benchmark-r13output/", - "train": "training-monolingual.tokenized.shuffled/*" - }, - "dataset_iterator": { - "class_name": "elmo_file_paths_iterator", - "seed": 31415, - "unroll_steps": 20, - "max_word_length": 50, - "n_gpus": 1, - "shuffle": false, - "bos": "", - "eos": "", - "save_path": "{MODELS_PATH}/elmo-1b-benchmark/vocab-2016-09-10.txt", - "load_path": "{MODELS_PATH}/elmo-1b-benchmark/vocab-2016-09-10.txt" - }, - "chainer": { - "in": [ - "x_char_ids" - ], - "in_y": [ - 
"y_token_ids" - ], - "pipe": [ - { - "class_name": "elmo_model", - "options_json_path": "{MODELS_PATH}/elmo-1b-benchmark/options.json", - "unroll_steps": 20, - "batch_size": 128, - "save_path": "{MODELS_PATH}/elmo-1b-benchmark/saves/model", - "load_path": "{MODELS_PATH}/elmo-1b-benchmark/saves/model", - "in": ["x_char_ids", "y_token_ids"], - "in_y": [], - "n_gpus": 1, - "out": ["loss"] - } - ], - "out": [ - "x_char_ids" - ] - }, - "train": { - "epochs": 20, - "batch_size": 128, - "log_every_n_batches": 100, - "val_every_n_epochs": 1, - "validation_patience": 4, - "metric_optimization": "minimize", - "metrics": [ - { - "name": "elmo_loss2ppl", - "inputs": ["loss"] - } - ], - "tensorboard_log_dir": "{MODELS_PATH}/elmo-1b-benchmark/logs", - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/1-billion-word-language-modeling-benchmark-r13output.tar.gz", - "subdir": "{DOWNLOADS_PATH}/elmo-1b-benchmark/data" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/original_elmo_configuration_and_vocab.tar.gz", - "subdir": "{MODELS_PATH}/elmo-1b-benchmark" - } - ] - } -} diff --git a/deeppavlov/configs/elmo/elmo_1b_benchmark_test.json b/deeppavlov/configs/elmo/elmo_1b_benchmark_test.json deleted file mode 100644 index 15af5b02ae..0000000000 --- a/deeppavlov/configs/elmo/elmo_1b_benchmark_test.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "dataset_reader": { - "class_name": "file_paths_reader", - "data_path": "{DOWNLOADS_PATH}/elmo-1b-benchmark_test/data/1-billion-word-language-modeling-benchmark-r13output/", - "train": "heldout-monolingual.tokenized.shuffled/news.en.heldout-00001-of-00050", - "test": "heldout-monolingual.tokenized.shuffled/news.en.heldout-00002-of-00050", - "valid": "heldout-monolingual.tokenized.shuffled/news.en.heldout-00003-of-00050" - }, - "dataset_iterator": { - "class_name": "elmo_file_paths_iterator", - "seed": 31415, - "unroll_steps": 20, - "max_word_length": 50, - "n_gpus": 1, - "shuffle": false, - "bos": "", - "eos": "", - "save_path": "{DOWNLOADS_PATH}/elmo-1b-benchmark_test/data/vocab-2016-09-10.txt", - "load_path": "{DOWNLOADS_PATH}/elmo-1b-benchmark_test/data/vocab-2016-09-10.txt" - }, - "chainer": { - "in": [ - "x_char_ids" - ], - "in_y": [ - "y_token_ids" - ], - "pipe": [ - { - "class_name": "elmo_model", - "options_json_path": "{DOWNLOADS_PATH}/elmo-1b-benchmark_test/options.json", - "unroll_steps": 20, - "batch_size": 128, - "save_path": "{MODELS_PATH}/elmo-1b-benchmark_test/saves/model", - "load_path": "{MODELS_PATH}/elmo-1b-benchmark_test/saves/model", - "in": ["x_char_ids", "y_token_ids"], - "in_y": [], - "n_gpus": 1, - "out": ["loss"] - } - ], - "out": [ - "x_char_ids" - ] - }, - "train": { - "epochs": 2, - "batch_size": 128, - "log_every_n_batches": 5, - "val_every_n_epochs": 1, - "validation_patience": 4, - "metric_optimization": "minimize", - "metrics": [ - { - "name": "elmo_loss2ppl", - "inputs": ["loss"] - } - ], - "tensorboard_log_dir": "{MODELS_PATH}/elmo-1b-benchmark_test/logs", - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": 
"http://files.deeppavlov.ai/deeppavlov_data/elmo-1b-benchmark_test.tar.gz", - "subdir": "{DOWNLOADS_PATH}" - } - ] - } -} diff --git a/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_news.json b/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_news.json deleted file mode 100644 index ecaa2afb39..0000000000 --- a/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_news.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "dataset_reader": { - "class_name": "file_paths_reader", - "data_path": "{DOWNLOADS_PATH}/elmo-lm-ready4fine-example-data/data/", - "train": "train/*", - "valid": "heldout/*" - }, - "dataset_iterator": { - "class_name": "elmo_file_paths_iterator", - "seed": 31415, - "unroll_steps": 20, - "max_word_length": 50, - "n_gpus": 1, - "shuffle": false, - "bos": "", - "eos": "", - "save_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news/vocab.txt", - "load_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news/vocab.txt" - }, - "chainer": { - "in": [ - "x_char_ids" - ], - "in_y": [ - "y_token_ids" - ], - "pipe": [ - { - "class_name": "elmo_model", - "options_json_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news/options.json", - "unroll_steps": 20, - "batch_size": 128, - "save_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news/saves/model", - "load_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news/saves/model", - "in": ["x_char_ids", "y_token_ids"], - "in_y": [], - "n_gpus": 1, - "out": ["loss"] - } - ], - "out": [ - "x_char_ids", - "y_token_ids" - ] - }, - "train": { - "epochs": 20, - "batch_size": 128, - "log_every_n_batches": 100, - "val_every_n_epochs": 1, - "validation_patience": 4, - "metric_optimization": "minimize", - "metrics": [ - { - "name": "elmo_loss2ppl", - "inputs": ["loss"] - } - ], - "tensorboard_log_dir": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news/logs", - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-example-data.tar.gz", - "subdir": "{DOWNLOADS_PATH}/" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-tuning-ru-news.tar.gz", - "subdir": "{MODELS_PATH}/" - } - ] - } -} \ No newline at end of file diff --git a/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_news_simple.json b/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_news_simple.json deleted file mode 100644 index f7a95f1238..0000000000 --- a/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_news_simple.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "dataset_reader": { - "class_name": "file_paths_reader", - "data_path": "{DOWNLOADS_PATH}/elmo-lm-ready4fine-example-data/data/", - "train": "train/*", - "valid": "heldout/*" - }, - "dataset_iterator": { - "class_name": "elmo_file_paths_iterator", - "seed": 31415, - "unroll_steps": 20, - "max_word_length": 50, - "n_gpus": 1, - "shuffle": false, - "bos": "", - "eos": "", - "save_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news-simple/vocab.txt", - "load_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news-simple/vocab.txt" - }, - "chainer": { - "in": [ - "x_char_ids" - ], - "in_y": [ - "y_token_ids" - ], - "pipe": [ - { - "class_name": "elmo_model", - "options_json_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news-simple/options.json", - "unroll_steps": 20, - "batch_size": 128, - "save_path": 
"{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news-simple/saves/model", - "load_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news-simple/saves/model", - "in": ["x_char_ids", "y_token_ids"], - "in_y": [], - "n_gpus": 1, - "out": ["loss"] - } - ], - "out": [ - "x_char_ids", - "y_token_ids" - ] - }, - "train": { - "epochs": 20, - "batch_size": 128, - "log_every_n_batches": 100, - "val_every_n_epochs": 1, - "validation_patience": 4, - "metric_optimization": "minimize", - "metrics": [ - { - "name": "elmo_loss2ppl", - "inputs": ["loss"] - } - ], - "tensorboard_log_dir": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news-simple/logs", - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-example-data.tar.gz", - "subdir": "{DOWNLOADS_PATH}/" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-tuning-ru-news-simple.tar.gz", - "subdir": "{MODELS_PATH}/" - } - ] - } -} \ No newline at end of file diff --git a/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_twitter.json b/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_twitter.json deleted file mode 100644 index 9a4a2f9007..0000000000 --- a/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_twitter.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "dataset_reader": { - "class_name": "file_paths_reader", - "data_path": "{DOWNLOADS_PATH}/elmo-lm-ready4fine-example-data/data/", - "train": "train/*", - "valid": "heldout/*" - }, - "dataset_iterator": { - "class_name": "elmo_file_paths_iterator", - "seed": 31415, - "unroll_steps": 20, - "max_word_length": 50, - "n_gpus": 1, - "shuffle": false, - "bos": "", - "eos": "", - "save_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-twitter/vocab.txt", - "load_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-twitter/vocab.txt" - }, - "chainer": { - "in": [ - "x_char_ids" - ], - "in_y": [ - "y_token_ids" - ], - "pipe": [ - { - "class_name": "elmo_model", - "options_json_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-twitter/options.json", - "unroll_steps": 20, - "batch_size": 128, - "save_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-twitter/saves/model", - "load_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-twitter/saves/model", - "in": ["x_char_ids", "y_token_ids"], - "in_y": [], - "n_gpus": 1, - "out": ["loss"] - } - ], - "out": [ - "x_char_ids", - "y_token_ids" - ] - }, - "train": { - "epochs": 20, - "batch_size": 128, - "log_every_n_batches": 100, - "val_every_n_epochs": 1, - "validation_patience": 4, - "metric_optimization": "minimize", - "metrics": [ - { - "name": "elmo_loss2ppl", - "inputs": ["loss"] - } - ], - "tensorboard_log_dir": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-twitter/logs", - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-example-data.tar.gz", - "subdir": "{DOWNLOADS_PATH}/" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-tuning-ru-twitter.tar.gz", - "subdir": "{MODELS_PATH}/" - } - ] - } -} \ No newline at end of file diff --git 
a/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_twitter_simple.json b/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_twitter_simple.json deleted file mode 100644 index 6ffd491f07..0000000000 --- a/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_twitter_simple.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "dataset_reader": { - "class_name": "file_paths_reader", - "data_path": "{DOWNLOADS_PATH}/elmo-lm-ready4fine-example-data/data/", - "train": "train/*", - "valid": "heldout/*" - }, - "dataset_iterator": { - "class_name": "elmo_file_paths_iterator", - "seed": 31415, - "unroll_steps": 20, - "max_word_length": 50, - "n_gpus": 1, - "shuffle": false, - "bos": "", - "eos": "", - "save_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-twitter-simple/vocab.txt", - "load_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-twitter-simple/vocab.txt" - }, - "chainer": { - "in": [ - "x_char_ids" - ], - "in_y": [ - "y_token_ids" - ], - "pipe": [ - { - "class_name": "elmo_model", - "options_json_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-twitter-simple/options.json", - "unroll_steps": 20, - "batch_size": 128, - "save_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-twitter-simple/saves/model", - "load_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-twitter-simple/saves/model", - "in": ["x_char_ids", "y_token_ids"], - "in_y": [], - "n_gpus": 1, - "out": ["loss"] - } - ], - "out": [ - "x_char_ids", - "y_token_ids" - ] - }, - "train": { - "epochs": 20, - "batch_size": 128, - "log_every_n_batches": 100, - "val_every_n_epochs": 1, - "validation_patience": 4, - "metric_optimization": "minimize", - "metrics": [ - { - "name": "elmo_loss2ppl", - "inputs": ["loss"] - } - ], - "tensorboard_log_dir": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-twitter-simple/logs", - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-example-data.tar.gz", - "subdir": "{DOWNLOADS_PATH}/" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-tuning-ru-twitter-simple.tar.gz", - "subdir": "{MODELS_PATH}/" - } - ] - } -} \ No newline at end of file diff --git a/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_wiki.json b/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_wiki.json deleted file mode 100644 index c44e850215..0000000000 --- a/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_wiki.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "dataset_reader": { - "class_name": "file_paths_reader", - "data_path": "{DOWNLOADS_PATH}/elmo-lm-ready4fine-example-data/data/", - "train": "train/*", - "valid": "heldout/*" - }, - "dataset_iterator": { - "class_name": "elmo_file_paths_iterator", - "seed": 31415, - "unroll_steps": 20, - "max_word_length": 50, - "n_gpus": 1, - "shuffle": false, - "bos": "", - "eos": "", - "save_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-wiki/vocab.txt", - "load_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-wiki/vocab.txt" - }, - "chainer": { - "in": [ - "x_char_ids" - ], - "in_y": [ - "y_token_ids" - ], - "pipe": [ - { - "class_name": "elmo_model", - "options_json_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-wiki/options.json", - "unroll_steps": 20, - "batch_size": 128, - "save_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-wiki/saves/model", - "load_path": 
"{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-wiki/saves/model", - "in": ["x_char_ids", "y_token_ids"], - "in_y": [], - "n_gpus": 1, - "out": ["loss"] - } - ], - "out": [ - "x_char_ids", - "y_token_ids" - ] - }, - "train": { - "epochs": 20, - "batch_size": 128, - "log_every_n_batches": 100, - "val_every_n_epochs": 1, - "validation_patience": 4, - "metric_optimization": "minimize", - "metrics": [ - { - "name": "elmo_loss2ppl", - "inputs": ["loss"] - } - ], - "tensorboard_log_dir": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-wiki/logs", - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-example-data.tar.gz", - "subdir": "{DOWNLOADS_PATH}/" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-tuning-ru-wiki.tar.gz", - "subdir": "{MODELS_PATH}/" - } - ] - } -} \ No newline at end of file diff --git a/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_wiki_simple.json b/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_wiki_simple.json deleted file mode 100644 index c4188744e4..0000000000 --- a/deeppavlov/configs/elmo/elmo_lm_ready4fine_tuning_ru_wiki_simple.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "dataset_reader": { - "class_name": "file_paths_reader", - "data_path": "{DOWNLOADS_PATH}/elmo-lm-ready4fine-example-data/data/", - "train": "train/*", - "valid": "heldout/*" - }, - "dataset_iterator": { - "class_name": "elmo_file_paths_iterator", - "seed": 31415, - "unroll_steps": 20, - "max_word_length": 50, - "n_gpus": 1, - "shuffle": false, - "bos": "", - "eos": "", - "save_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-wiki-simple/vocab.txt", - "load_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-wiki-simple/vocab.txt" - }, - "chainer": { - "in": [ - "x_char_ids" - ], - "in_y": [ - "y_token_ids" - ], - "pipe": [ - { - "class_name": "elmo_model", - "options_json_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-wiki-simple/options.json", - "unroll_steps": 20, - "batch_size": 128, - "save_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-wiki-simple/saves/model", - "load_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-wiki-simple/saves/model", - "in": ["x_char_ids", "y_token_ids"], - "in_y": [], - "n_gpus": 1, - "out": ["loss"] - } - ], - "out": [ - "x_char_ids", - "y_token_ids" - ] - }, - "train": { - "epochs": 20, - "batch_size": 128, - "log_every_n_batches": 100, - "val_every_n_epochs": 1, - "validation_patience": 4, - "metric_optimization": "minimize", - "metrics": [ - { - "name": "elmo_loss2ppl", - "inputs": ["loss"] - } - ], - "tensorboard_log_dir": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-wiki-simple/logs", - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-example-data.tar.gz", - "subdir": "{DOWNLOADS_PATH}/" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-tuning-ru-wiki-simple.tar.gz", - "subdir": "{MODELS_PATH}/" - } - ] - } -} \ No newline at end of file diff --git a/deeppavlov/configs/elmo/elmo_paraphraser_fine_tuning.json 
b/deeppavlov/configs/elmo/elmo_paraphraser_fine_tuning.json deleted file mode 100644 index fce6382ffd..0000000000 --- a/deeppavlov/configs/elmo/elmo_paraphraser_fine_tuning.json +++ /dev/null @@ -1,84 +0,0 @@ -{ - "dataset_reader": { - "class_name": "file_paths_reader", - "data_path": "{DOWNLOADS_PATH}/paraphraser_train_and_pretrain_texts/", - "train": "paraphraser_train_and_pretrain_texts_train.txt", - "valid": "paraphraser_train_and_pretrain_texts_valid.txt", - "test": "paraphraser_train_and_pretrain_texts_test.txt" - }, - "dataset_iterator": { - "class_name": "elmo_file_paths_iterator", - "seed": 31415, - "unroll_steps": 20, - "max_word_length": 50, - "n_gpus": 1, - "shuffle": false, - "bos": "", - "eos": "", - "save_path": "{MODELS_PATH}/elmo_news_wmt11-16-simple_reduce_vocab/vocab-2016-09-10.txt", - "load_path": "{MODELS_PATH}/elmo_news_wmt11-16-simple_reduce_vocab/vocab-2016-09-10.txt" - }, - "chainer": { - "in": [ - "x_char_ids" - ], - "in_y": [ - "y_token_ids" - ], - "pipe": [ - { - "class_name": "elmo_model", - "options_json_path": "{MODELS_PATH}/elmo_news_wmt11-16-simple_reduce_vocab/options.json", - "unroll_steps": 20, - "batch_size": 128, - "save_path": "{MODELS_PATH}/elmo_news_wmt11-16-simple_reduce_vocab/saves/model", - "load_path": "{MODELS_PATH}/elmo_news_wmt11-16-simple_reduce_vocab/saves/model", - "in": ["x_char_ids", "y_token_ids"], - "in_y": [], - "n_gpus": 1, - "out": ["loss"] - } - ], - "out": [ - "x_char_ids", - "y_token_ids" - ] - }, - "train": { - "epochs": 1, - "batch_size": 128, - "log_every_n_batches": 24, - "val_every_n_epochs": 1, - "validation_patience": 1, - "metric_optimization": "minimize", - "metrics": [ - { - "name": "elmo_loss2ppl", - "inputs": ["loss"] - } - ], - "tensorboard_log_dir": "{MODELS_PATH}/elmo_news_wmt11-16-simple_reduce_vocab/logs", - "class_name": "nn_trainer", - "evaluation_targets": [ - "valid", - "test" - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/paraphraser_train_and_pretrain_texts.tar.gz", - "subdir": "{DOWNLOADS_PATH}/paraphraser_train_and_pretrain_texts" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo_news_wmt11-16-simple_reduce_vocab.tar.gz", - "subdir": "{MODELS_PATH}/" - } - ] - } -} \ No newline at end of file diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index 0e5891e11f..51157417e7 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -47,8 +47,6 @@ "dstc2_reader": "deeppavlov.dataset_readers.dstc2_reader:DSTC2DatasetReader", "dstc_slotfilling": "deeppavlov.models.slotfill.slotfill:DstcSlotFillingNetwork", "elmo_embedder": "deeppavlov.models.embedders.elmo_embedder:ELMoEmbedder", - "elmo_file_paths_iterator": "deeppavlov.dataset_iterators.elmo_file_paths_iterator:ELMoFilePathsIterator", - "elmo_model": "deeppavlov.models.elmo.elmo:ELMo", "emb_mat_assembler": "deeppavlov.models.preprocessors.assemble_embeddings_matrix:EmbeddingsMatrixAssembler", "entity_detection_parser": "deeppavlov.models.kbqa.entity_detection_parser:EntityDetectionParser", "entity_linker": "deeppavlov.models.kbqa.entity_linking:EntityLinker", diff --git a/deeppavlov/core/common/requirements_registry.json b/deeppavlov/core/common/requirements_registry.json index 25754cab2d..9434087155 100644 --- a/deeppavlov/core/common/requirements_registry.json +++ 
b/deeppavlov/core/common/requirements_registry.json @@ -75,10 +75,6 @@ "{DEEPPAVLOV_PATH}/requirements/tf.txt", "{DEEPPAVLOV_PATH}/requirements/tf-hub.txt" ], - "elmo_model": [ - "{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/tf-hub.txt" - ], "fasttext": [ "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" ], diff --git a/deeppavlov/dataset_iterators/elmo_file_paths_iterator.py b/deeppavlov/dataset_iterators/elmo_file_paths_iterator.py deleted file mode 100644 index a887fe8b4c..0000000000 --- a/deeppavlov/dataset_iterators/elmo_file_paths_iterator.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright 2017 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from logging import getLogger -from pathlib import Path -from typing import Tuple, Iterator, Optional, Dict, List, Union - -from deeppavlov.core.common.registry import register -from deeppavlov.core.data.simple_vocab import SimpleVocabulary -from deeppavlov.core.data.utils import chunk_generator -from deeppavlov.dataset_iterators.file_paths_iterator import FilePathsIterator -from deeppavlov.models.preprocessors.str_utf8_encoder import StrUTF8Encoder - -log = getLogger(__name__) - - -@register('elmo_file_paths_iterator') -class ELMoFilePathsIterator(FilePathsIterator): - """Dataset iterator for tokenized datasets like 1 Billion Word Benchmark - It gets lists of file paths from the data dictionary and returns batches of lines from each file. 
- - Args: - data: dict with keys ``'train'``, ``'valid'`` and ``'test'`` and values - load_path: path to the vocabulary to be load from - seed: random seed for data shuffling - shuffle: whether to shuffle data during batching - unroll_steps: number of unrolling steps - n_gpus: number of gpu to use - max_word_length: max length of word - bos: tag of begin of sentence - eos: tag of end of sentence - - """ - - def __init__(self, - data: Dict[str, List[Union[str, Path]]], - load_path: Union[str, Path], - seed: Optional[int] = None, - shuffle: bool = True, - unroll_steps: Optional[int] = None, - n_gpus: Optional[int] = None, - max_word_length: Optional[int] = None, - bos: str = "", - eos: str = "", - *args, **kwargs) -> None: - self.unroll_steps = unroll_steps - self.n_gpus = n_gpus - self.bos = bos - self.eos = eos - self.str_utf8_encoder = StrUTF8Encoder( - max_word_length=max_word_length, - pad_special_char_use=True, - word_boundary_special_char_use=True, - sentence_boundary_special_char_use=False, - reversed_sentense_tokens=False, - bos=self.bos, - eos=self.eos, - save_path=load_path, - load_path=load_path, - ) - self.simple_vocab = SimpleVocabulary( - min_freq=2, - special_tokens=[self.eos, self.bos, ""], - unk_token="", - freq_drop_load=True, - save_path=load_path, - load_path=load_path, - ) - super().__init__(data, seed, shuffle, *args, **kwargs) - - def _line2ids(self, line): - line = [self.bos] + line.split() + [self.eos] - - char_ids = self.str_utf8_encoder(line) - reversed_char_ids = list(reversed(char_ids)) - char_ids = char_ids[:-1] - reversed_char_ids = reversed_char_ids[:-1] - - token_ids = self.simple_vocab(line) - reversed_token_ids = list(reversed(token_ids)) - token_ids = token_ids[1:] - reversed_token_ids = reversed_token_ids[1:] - - return char_ids, reversed_char_ids, token_ids, reversed_token_ids - - def _line_generator(self, shard_generator): - for shard in shard_generator: - line_generator = chunk_generator(shard, 1) - for line in line_generator: - line = line[0] - char_ids, reversed_char_ids, token_ids, reversed_token_ids = \ - self._line2ids(line) - yield char_ids, reversed_char_ids, token_ids, reversed_token_ids - - @staticmethod - def _batch_generator(line_generator, batch_size, unroll_steps): - batch = [[[] for i in range(4)] for i in range(batch_size)] - stream = [[[] for i in range(4)] for i in range(batch_size)] - - try: - while True: - for batch_item, stream_item in zip(batch, stream): - while len(stream_item[0]) < unroll_steps: - line = next(line_generator) - for sti, lni in zip(stream_item, line): - sti.extend(lni) - for sti, bchi in zip(stream_item, batch_item): - _b = sti[:unroll_steps] - _s = sti[unroll_steps:] - bchi.clear() - _b = _b - bchi.extend(_b) - - sti.clear() - sti.extend(_s) - char_ids, reversed_char_ids, token_ids, reversed_token_ids = \ - zip(*batch) - yield char_ids, reversed_char_ids, token_ids, reversed_token_ids - except StopIteration: - pass - - def gen_batches(self, batch_size: int, data_type: str = 'train', shuffle: Optional[bool] = None) \ - -> Iterator[Tuple[str, str]]: - if shuffle is None: - shuffle = self.shuffle - - tgt_data = self.data[data_type] - shard_generator = self._shard_generator(tgt_data, shuffle=shuffle) - line_generator = self._line_generator(shard_generator) - - if data_type == 'train': - unroll_steps = self.unroll_steps - n_gpus = self.n_gpus - else: - unroll_steps = 1 - batch_size = 256 - n_gpus = 1 - - batch_generator = self._batch_generator(line_generator, batch_size * n_gpus, unroll_steps) - - for char_ids, 
reversed_char_ids, token_ids, reversed_token_ids in batch_generator: - batch = [(char_ids, reversed_char_ids), (token_ids, reversed_token_ids)] - yield batch diff --git a/deeppavlov/models/elmo/__init__.py b/deeppavlov/models/elmo/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/deeppavlov/models/elmo/bilm_model.py b/deeppavlov/models/elmo/bilm_model.py deleted file mode 100644 index cc7eacb8b0..0000000000 --- a/deeppavlov/models/elmo/bilm_model.py +++ /dev/null @@ -1,510 +0,0 @@ -# originally based on https://github.com/allenai/bilm-tf/blob/master/bilm/training.py - -# Modifications copyright 2017 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import tensorflow as tf - -DTYPE = 'float32' -DTYPE_INT = 'int64' - -tf.logging.set_verbosity(tf.logging.INFO) - - -class LanguageModel(object): - """ - A class to build the tensorflow computational graph for NLMs - - All hyperparameters and model configuration is specified in a dictionary - of 'options'. - - is_training is a boolean used to control behavior of dropout layers - and softmax. Set to False for testing. - - The LSTM cell is controlled by the 'lstm' key in options - Here is an example: - - 'lstm': { - 'cell_clip': 5, - 'dim': 4096, - 'n_layers': 2, - 'proj_clip': 5, - 'projection_dim': 512, - 'use_skip_connections': True}, - - 'projection_dim' is assumed token embedding size and LSTM output size. - 'dim' is the hidden state size. - Set 'dim' == 'projection_dim' to skip a projection layer. - """ - - def __init__(self, options, is_training): - self.options = options - self.is_training = is_training - self.bidirectional = options.get('bidirectional', False) - - # use word or char inputs? 
- self.char_inputs = 'char_cnn' in self.options - - # for the loss function - self.share_embedding_softmax = options.get( - 'share_embedding_softmax', False) - if self.char_inputs and self.share_embedding_softmax: - raise ValueError("Sharing softmax and embedding weights requires " - "word input") - - self.sample_softmax = options.get('sample_softmax', True) - - self._build() - - def _build_word_embeddings(self): - n_tokens_vocab = self.options['n_tokens_vocab'] - batch_size = self.options['batch_size'] - unroll_steps = self.options['unroll_steps'] - - # LSTM options - projection_dim = self.options['lstm']['projection_dim'] - - # the input token_ids and word embeddings - self.token_ids = tf.placeholder(DTYPE_INT, - shape=(batch_size, unroll_steps), - name='token_ids') - # the word embeddings - with tf.device("/cpu:0"): - self.embedding_weights = tf.get_variable( - "embedding", [n_tokens_vocab, projection_dim], - dtype=DTYPE, - ) - self.embedding = tf.nn.embedding_lookup(self.embedding_weights, - self.token_ids) - - # if a bidirectional LM then make placeholders for reverse - # model and embeddings - if self.bidirectional: - self.token_ids_reverse = tf.placeholder(DTYPE_INT, - shape=(batch_size, unroll_steps), - name='token_ids_reverse') - with tf.device("/cpu:0"): - self.embedding_reverse = tf.nn.embedding_lookup( - self.embedding_weights, self.token_ids_reverse) - - def _build_word_char_embeddings(self): - """ - options contains key 'char_cnn': { - - 'n_characters': 262, - - # includes the start / end characters - 'max_characters_per_token': 50, - - 'filters': [ - [1, 32], - [2, 32], - [3, 64], - [4, 128], - [5, 256], - [6, 512], - [7, 512] - ], - 'activation': 'tanh', - - # for the character embedding - 'embedding': {'dim': 16} - - # for highway layers - # if omitted, then no highway layers - 'n_highway': 2, - } - """ - batch_size = self.options['batch_size'] - unroll_steps = self.options['unroll_steps'] - projection_dim = self.options['lstm']['projection_dim'] - - cnn_options = self.options['char_cnn'] - filters = cnn_options['filters'] - n_filters = sum(f[1] for f in filters) - max_chars = cnn_options['max_characters_per_token'] - char_embed_dim = cnn_options['embedding']['dim'] - n_chars = cnn_options['n_characters'] - if n_chars != 261: - raise Exception("Set n_characters=261 for training see a \ - https://github.com/allenai/bilm-tf/blob/master/README.md") - if cnn_options['activation'] == 'tanh': - activation = tf.nn.tanh - elif cnn_options['activation'] == 'relu': - activation = tf.nn.relu - - # the input character ids - self.tokens_characters = tf.placeholder(DTYPE_INT, - shape=(batch_size, unroll_steps, max_chars), - name='tokens_characters') - # the character embeddings - with tf.device("/cpu:0"): - self.embedding_weights = tf.get_variable("char_embed", [n_chars, char_embed_dim], - dtype=DTYPE, - initializer=tf.random_uniform_initializer(-1.0, 1.0)) - # shape (batch_size, unroll_steps, max_chars, embed_dim) - self.char_embedding = tf.nn.embedding_lookup(self.embedding_weights, - self.tokens_characters) - - if self.bidirectional: - self.tokens_characters_reverse = tf.placeholder(DTYPE_INT, - shape=(batch_size, unroll_steps, max_chars), - name='tokens_characters_reverse') - self.char_embedding_reverse = tf.nn.embedding_lookup( - self.embedding_weights, self.tokens_characters_reverse) - - # the convolutions - def make_convolutions(inp, reuse): - with tf.variable_scope('CNN', reuse=reuse): - convolutions = [] - for i, (width, num) in enumerate(filters): - if 
cnn_options['activation'] == 'relu': - # He initialization for ReLU activation - # with char embeddings init between -1 and 1 - # w_init = tf.random_normal_initializer( - # mean=0.0, - # stddev=np.sqrt(2.0 / (width * char_embed_dim)) - # ) - - # Kim et al 2015, +/- 0.05 - w_init = tf.random_uniform_initializer( - minval=-0.05, maxval=0.05) - elif cnn_options['activation'] == 'tanh': - # glorot init - w_init = tf.random_normal_initializer( - mean=0.0, - stddev=np.sqrt(1.0 / (width * char_embed_dim)) - ) - w = tf.get_variable( - "W_cnn_%s" % i, - [1, width, char_embed_dim, num], - initializer=w_init, - dtype=DTYPE) - b = tf.get_variable( - "b_cnn_%s" % i, [num], dtype=DTYPE, - initializer=tf.constant_initializer(0.0)) - - conv = tf.nn.conv2d(inp, w, - strides=[1, 1, 1, 1], - padding="VALID") + b - # now max pool - conv = tf.nn.max_pool(conv, [1, 1, max_chars - width + 1, 1], - [1, 1, 1, 1], 'VALID') - - # activation - conv = activation(conv) - conv = tf.squeeze(conv, squeeze_dims=[2]) - - convolutions.append(conv) - - return tf.concat(convolutions, 2) - - # for first model, this is False, for others it's True - reuse = tf.get_variable_scope().reuse - embedding = make_convolutions(self.char_embedding, reuse) - - self.token_embedding_layers = [embedding] - - if self.bidirectional: - # re-use the CNN weights from forward pass - embedding_reverse = make_convolutions( - self.char_embedding_reverse, True) - - # for highway and projection layers: - # reshape from (batch_size, n_tokens, dim) to - n_highway = cnn_options.get('n_highway') - use_highway = n_highway is not None and n_highway > 0 - use_proj = n_filters != projection_dim - - if use_highway or use_proj: - embedding = tf.reshape(embedding, [-1, n_filters]) - if self.bidirectional: - embedding_reverse = tf.reshape(embedding_reverse, - [-1, n_filters]) - - # set up weights for projection - if use_proj: - assert n_filters > projection_dim - with tf.variable_scope('CNN_proj'): - W_proj_cnn = tf.get_variable( - "W_proj", [n_filters, projection_dim], - initializer=tf.random_normal_initializer( - mean=0.0, stddev=np.sqrt(1.0 / n_filters)), - dtype=DTYPE) - b_proj_cnn = tf.get_variable( - "b_proj", [projection_dim], - initializer=tf.constant_initializer(0.0), - dtype=DTYPE) - - # apply highways layers - def high(x, ww_carry, bb_carry, ww_tr, bb_tr): - carry_gate = tf.nn.sigmoid(tf.matmul(x, ww_carry) + bb_carry) - transform_gate = tf.nn.relu(tf.matmul(x, ww_tr) + bb_tr) - return carry_gate * transform_gate + (1.0 - carry_gate) * x - - if use_highway: - highway_dim = n_filters - - for i in range(n_highway): - with tf.variable_scope('CNN_high_%s' % i): - W_carry = tf.get_variable( - 'W_carry', [highway_dim, highway_dim], - # glorit init - initializer=tf.random_normal_initializer( - mean=0.0, stddev=np.sqrt(1.0 / highway_dim)), - dtype=DTYPE) - b_carry = tf.get_variable( - 'b_carry', [highway_dim], - initializer=tf.constant_initializer(-2.0), - dtype=DTYPE) - W_transform = tf.get_variable( - 'W_transform', [highway_dim, highway_dim], - initializer=tf.random_normal_initializer( - mean=0.0, stddev=np.sqrt(1.0 / highway_dim)), - dtype=DTYPE) - b_transform = tf.get_variable( - 'b_transform', [highway_dim], - initializer=tf.constant_initializer(0.0), - dtype=DTYPE) - - embedding = high(embedding, W_carry, b_carry, - W_transform, b_transform) - if self.bidirectional: - embedding_reverse = high(embedding_reverse, - W_carry, b_carry, - W_transform, b_transform) - self.token_embedding_layers.append(tf.reshape(embedding, - [batch_size, unroll_steps, 
highway_dim])) - - # finally project down to projection dim if needed - if use_proj: - embedding = tf.matmul(embedding, W_proj_cnn) + b_proj_cnn - if self.bidirectional: - embedding_reverse = tf.matmul(embedding_reverse, W_proj_cnn) \ - + b_proj_cnn - self.token_embedding_layers.append( - tf.reshape(embedding, [batch_size, unroll_steps, projection_dim]) - ) - - # reshape back to (batch_size, tokens, dim) - if use_highway or use_proj: - shp = [batch_size, unroll_steps, projection_dim] - embedding = tf.reshape(embedding, shp) - if self.bidirectional: - embedding_reverse = tf.reshape(embedding_reverse, shp) - - # at last assign attributes for remainder of the model - self.embedding = embedding - if self.bidirectional: - self.embedding_reverse = embedding_reverse - - def _build(self): - # size of input options - batch_size = self.options['batch_size'] - - # LSTM options - lstm_dim = self.options['lstm']['dim'] - projection_dim = self.options['lstm']['projection_dim'] - n_lstm_layers = self.options['lstm'].get('n_layers', 1) - dropout = self.options['dropout'] - keep_prob = 1.0 - dropout - - if self.char_inputs: - self._build_word_char_embeddings() - else: - self._build_word_embeddings() - - # now the LSTMs - # these will collect the initial states for the forward - # (and reverse LSTMs if we are doing bidirectional) - self.init_lstm_state = [] - self.final_lstm_state = [] - - # get the LSTM inputs - if self.bidirectional: - lstm_inputs = [self.embedding, self.embedding_reverse] - else: - lstm_inputs = [self.embedding] - - # now compute the LSTM outputs - cell_clip = self.options['lstm'].get('cell_clip') - proj_clip = self.options['lstm'].get('proj_clip') - - use_skip_connections = self.options['lstm'].get('use_skip_connections') - - lstm_outputs = [] - for lstm_num, lstm_input in enumerate(lstm_inputs): - lstm_cells = [] - for i in range(n_lstm_layers): - if projection_dim < lstm_dim: - # are projecting down output - lstm_cell = tf.nn.rnn_cell.LSTMCell( - lstm_dim, num_proj=projection_dim, - cell_clip=cell_clip, proj_clip=proj_clip) - else: - lstm_cell = tf.nn.rnn_cell.LSTMCell( - lstm_dim, - cell_clip=cell_clip, proj_clip=proj_clip) - - if use_skip_connections: - # ResidualWrapper adds inputs to outputs - if i == 0: - # don't add skip connection from token embedding to - # 1st layer output - pass - else: - # add a skip connection - lstm_cell = tf.nn.rnn_cell.ResidualWrapper(lstm_cell) - - # add dropout - if self.is_training: - lstm_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, - input_keep_prob=keep_prob) - - lstm_cells.append(lstm_cell) - - if n_lstm_layers > 1: - lstm_cell = tf.nn.rnn_cell.MultiRNNCell(lstm_cells) - else: - lstm_cell = lstm_cells[0] - - with tf.control_dependencies([lstm_input]): - self.init_lstm_state.append( - lstm_cell.zero_state(batch_size, DTYPE)) - # NOTE: this variable scope is for backward compatibility - # with existing models... 
- if self.bidirectional: - with tf.variable_scope('RNN_%s' % lstm_num): - _lstm_output_unpacked, final_state = tf.nn.static_rnn( - lstm_cell, - tf.unstack(lstm_input, axis=1), - initial_state=self.init_lstm_state[-1]) - else: - _lstm_output_unpacked, final_state = tf.nn.static_rnn( - lstm_cell, - tf.unstack(lstm_input, axis=1), - initial_state=self.init_lstm_state[-1]) - self.final_lstm_state.append(final_state) - - # (batch_size * unroll_steps, 512) - lstm_output_flat = tf.reshape( - tf.stack(_lstm_output_unpacked, axis=1), [-1, projection_dim]) - if self.is_training: - # add dropout to output - lstm_output_flat = tf.nn.dropout(lstm_output_flat, keep_prob) - tf.add_to_collection('lstm_output_embeddings', _lstm_output_unpacked) - - lstm_outputs.append(lstm_output_flat) - - self._build_loss(lstm_outputs) - - def _build_loss(self, lstm_outputs): - """ - Create: - self.total_loss: total loss op for training - self.softmax_W, softmax_b: the softmax variables - self.next_token_id / _reverse: placeholders for gold input - - """ - batch_size = self.options['batch_size'] - unroll_steps = self.options['unroll_steps'] - - n_tokens_vocab = self.options['n_tokens_vocab'] - - # DEFINE next_token_id and *_reverse placeholders for the gold input - def _get_next_token_placeholders(suffix): - name = 'next_token_id' + suffix - id_placeholder = tf.placeholder(DTYPE_INT, - shape=(batch_size, unroll_steps), - name=name) - return id_placeholder - - # get the window and weight placeholders - self.next_token_id = _get_next_token_placeholders('') - if self.bidirectional: - self.next_token_id_reverse = _get_next_token_placeholders( - '_reverse') - - # DEFINE THE SOFTMAX VARIABLES - # get the dimension of the softmax weights - # softmax dimension is the size of the output projection_dim - softmax_dim = self.options['lstm']['projection_dim'] - - # the output softmax variables -- they are shared if bidirectional - if self.share_embedding_softmax: - # softmax_W is just the embedding layer - self.softmax_W = self.embedding_weights - - with tf.variable_scope('softmax'), tf.device('/cpu:0'): - # Glorit init (std=(1.0 / sqrt(fan_in)) - softmax_init = tf.random_normal_initializer(0.0, 1.0 / np.sqrt(softmax_dim)) - if not self.share_embedding_softmax: - self.softmax_W = tf.get_variable( - 'W', [n_tokens_vocab, softmax_dim], - dtype=DTYPE, - initializer=softmax_init - ) - self.softmax_b = tf.get_variable( - 'b', [n_tokens_vocab], - dtype=DTYPE, - initializer=tf.constant_initializer(0.0)) - - # now calculate losses - # loss for each direction of the LSTM - self.individual_train_losses = [] - self.individual_eval_losses = [] - - if self.bidirectional: - next_ids = [self.next_token_id, self.next_token_id_reverse] - else: - next_ids = [self.next_token_id] - - for id_placeholder, lstm_output_flat in zip(next_ids, lstm_outputs): - # flatten the LSTM output and next token id gold to shape: - # (batch_size * unroll_steps, softmax_dim) - # Flatten and reshape the token_id placeholders - next_token_id_flat = tf.reshape(id_placeholder, [-1, 1]) - - with tf.control_dependencies([lstm_output_flat]): - sampled_losses = tf.nn.sampled_softmax_loss(self.softmax_W, self.softmax_b, - next_token_id_flat, lstm_output_flat, - self.options['n_negative_samples_batch'], - self.options['n_tokens_vocab'], - num_true=1) - - # get the full softmax loss - output_scores = tf.matmul( - lstm_output_flat, - tf.transpose(self.softmax_W) - ) + self.softmax_b - # NOTE: tf.nn.sparse_softmax_cross_entropy_with_logits - # expects unnormalized output since it 
performs the - # softmax internally - losses = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=output_scores, - labels=tf.squeeze(next_token_id_flat, squeeze_dims=[1]) - ) - sampled_losses = tf.reshape(sampled_losses, [self.options['batch_size'], -1]) - losses = tf.reshape(losses, [self.options['batch_size'], -1]) - self.individual_train_losses.append(tf.reduce_mean(sampled_losses, axis=1)) - self.individual_eval_losses.append(tf.reduce_mean(losses, axis=1)) - - # now make the total loss -- it's the train of the individual losses - if self.bidirectional: - self.total_train_loss = 0.5 * (self.individual_train_losses[0] + self.individual_train_losses[1]) - self.total_eval_loss = 0.5 * (self.individual_eval_losses[0] + self.individual_eval_losses[1]) - else: - self.total_train_loss = self.individual_train_losses[0] - self.total_eval_loss = self.individual_eval_losses[0] diff --git a/deeppavlov/models/elmo/elmo.py b/deeppavlov/models/elmo/elmo.py deleted file mode 100644 index f197ae7c15..0000000000 --- a/deeppavlov/models/elmo/elmo.py +++ /dev/null @@ -1,601 +0,0 @@ -# originally based on https://github.com/allenai/bilm-tf/blob/master/bilm/training.py - -# Modifications copyright 2017 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import json -from logging import getLogger -from typing import Optional, List - -import numpy as np -import tensorflow as tf -from overrides import overrides - -from deeppavlov.core.commands.utils import expand_path -from deeppavlov.core.common.registry import register -from deeppavlov.core.models.nn_model import NNModel -from deeppavlov.models.elmo.bilm_model import LanguageModel -from deeppavlov.models.elmo.elmo2tfhub import export2hub -from deeppavlov.models.elmo.train_utils import average_gradients, clip_grads, safely_str2int, dump_weights - -log = getLogger(__name__) - - -@register('elmo_model') -class ELMo(NNModel): - """ - The :class:`~deeppavlov.models.elmo.elmo.ELMo` is a deep contextualized word representation that models both - complex characteristics of word use (e.g., syntax and semantics), and how these uses vary across linguistic - contexts (i.e., to model polysemy). - - You can use this component for LM training, fine tuning, dumping ELMo to a hdf5 file and wrapping it to - the tensorflow hub. - - - Parameters: - options_json_path: Path to the json configure. - char_cnn: Options of char_cnn. For example {"activation":"relu","embedding":{"dim":16}, - "filters":[[1,32],[2,32],[3,64],[4,128],[5,256],[6,512],[7,1024]],"max_characters_per_token":50, - "n_characters":261,"n_highway":2} - bidirectional: Whether to use bidirectional or not. - unroll_steps: Number of unrolling steps. - n_tokens_vocab: A size of a vocabulary. - lstm: Options of lstm. It is a dict of "cell_clip":int, "dim":int, "n_layers":int, "proj_clip":int, - "projection_dim":int, "use_skip_connections":bool - dropout: Probability of keeping the network state, values from 0 to 1. 
- n_negative_samples_batch: Whether to use negative samples batch or not. Number of batch samples. - all_clip_norm_val: Clip the gradients. - initial_accumulator_value: Whether to use dropout between layers or not. - learning_rate: Learning rate to use during the training (usually from 0.1 to 0.0001) - n_gpus: Number of gpu to use. - seed: Random seed. - batch_size: A size of a train batch. - load_epoch_num: An index of loading epoch. - epoch_load_path: An epoch loading path relative to save_path. - epoch_save_path: An epoch saving path relative to save_path. - If epoch_save_path is None then epoch_save_path = epoch_load_path. - dumps_save_path: A dump saving path relative to save_path. - tf_hub_save_path: A tf_hub saving path relative to save_path. - - To train ELMo representations from a paper `Deep contextualized word representations - `__ you can use multiple GPUs by set ``n_gpus`` parameter. - - You can explicitly specify the path to a json file with hyperparameters of ELMo used to train by - ``options_json_path`` parameter. - The json file must be the same as the json file from `original ELMo implementation - `__. You can define the architecture using the separate parameters. - - Saving the model will take place in directories with some structure, see below example: - - {MODELS_PATH}/ - elmo_model/ - saves/ - epochs/ - 1/, 2/, .... # directories of epochs - dumps/ - weights_epoch_n_1.hdf5, weights_epoch_n_2.hdf5, .... # hdf5 files of dumped ELMo weights - hubs/ - tf_hub_model_epoch_n_1/, tf_hub_model_epoch_n_2/, .... # directories of tensorflow hub wrapped - ELMo - - Intermediate checkpoints saved to `saves` directory. - To specify load/save paths use ``load_epoch_num``, ``epoch_load_path``, ``epoch_save_path``, ``dumps_save_path``, - ``tf_hub_save_path``. - - Dumping and tf_hub wrapping of ELMo occurs after each epoch. - - For learning the LM model dataset like 1 Billion Word Benchmark dataset is needed. - Examples of how datasets should look like you can learn from the configs of the examples below. - - Vocabulary file is a text file, with one token per line, separated by newlines. - Each token in the vocabulary is cached as the appropriate 50 character id sequence once. - It is recommended to always include the special and tokens (case sensitive) in the vocabulary file. - - For fine-tuning of LM on specific data, it is enough to save base model to path - ``{MODELS_PATH}/elmo_model/saves/epochs/0/`` and start training. - - Also for fine-tuning of LM on specific data, you can use pre-trained model for russian language on different - datasets. - - - LM model pre-trained on `ru-news` dataset ( lines = 63M, tokens = 946M, size = 12GB ), model is available by - :config:`elmo_lm_ready4fine_tuning_ru_news ` configuration file - or :config:`elmo_lm_ready4fine_tuning_ru_news_simple ` - configuration file. - - LM model pre-trained on `ru-twitter` dataset ( lines = 104M, tokens = 810M, size = 8.5GB ), model is available by - :config:`elmo_lm_ready4fine_tuning_ru_twitter ` configuration file - or :config:`elmo_lm_ready4fine_tuning_ru_twitter_simple ` - configuration file. - - LM model pre-trained on `ru-wiki` dataset ( lines = 1M, tokens = 386M, size = 5GB ), model is available by - :config:`elmo_lm_ready4fine_tuning_ru_wiki ` configuration file - or :config:`elmo_lm_ready4fine_tuning_ru_wiki_simple ` - configuration file. - - `simple` configuration file is a configuration of a model without special tags of output - vocab used for first training. - - .. 
note:: - - You need to download about **4 GB** also by default about **32 GB** of RAM and **10 GB** of GPU memory - required to running the :config:`elmo_lm_ready4fine_tuning_ru_* ` - on one GPU. - - After training you can use ``{MODELS_PATH}/elmo_model/saves/hubs/tf_hub_model_epoch_n_*/`` - as a ``ModuleSpec`` by using `TensorFlow Hub `__ or by - DeepPavlov :class:`~deeppavlov.models.embedders.elmo_embedder.ELMoEmbedder`. - - More about the ELMo model you can get from `original ELMo implementation - `__. - - - If some required packages are missing, install all the requirements by running in command line: - - .. code:: bash - - python -m deeppavlov install - - where ```` is a path to one of the :config:`provided config files ` - or its name without an extension, for example : - - .. code:: bash - - python -m deeppavlov install elmo_1b_benchmark_test - - Examples: - For a quick start, you can run test training of the test model on small data by this command from bash: - - .. code:: bash - - python -m deeppavlov train deeppavlov/configs/elmo/elmo_1b_benchmark_test.json -d - - To download the prepared `1 Billion Word Benchmark dataset `__ and - start a training model use this command from bash: - - .. note:: - - You need to download about **2 GB** also by default about **10 GB** of RAM and **10 GB** of GPU memory - required to running :config:`elmo_1b_benchmark ` on one GPU. - - .. code:: bash - - python -m deeppavlov train deeppavlov/configs/elmo/elmo_1b_benchmark.json -d - - To fine-tune ELMo as LM model on `1 Billion Word Benchmark dataset `__ - use commands from bash : - - .. code:: bash - - # download the prepared 1 Billion Word Benchmark dataset - python -m deeppavlov download deeppavlov/configs/elmo/elmo_1b_benchmark.json - # copy model checkpoint, network configuration, vocabulary of pre-trained LM model - mkdir -p ${MODELS_PATH}/elmo-1b-benchmark/saves/epochs/0 - cp my_ckpt.data-00000-of-00001 ${MODELS_PATH}/elmo-1b-benchmark/saves/epochs/0/model.data-00000-of-00001 - cp my_ckpt.index ${MODELS_PATH}/elmo-1b-benchmark/saves/epochs/0/model.index - cp my_ckpt.meta ${MODELS_PATH}/elmo-1b-benchmark/saves/epochs/0/model.meta - cp checkpoint ${MODELS_PATH}/elmo-1b-benchmark/saves/epochs/0/checkpoint - cp my_options.json ${MODELS_PATH}/elmo-1b-benchmark/options.json - cp my_vocab {MODELS_PATH}/elmo-1b-benchmark/vocab-2016-09-10.txt - # start a fine-tuning - python -m deeppavlov train deeppavlov/configs/elmo/elmo_1b_benchmark.json - - After training you can use the ELMo model from tf_hub wrapper by - `TensorFlow Hub `__ or by - DeepPavlov :class:`~deeppavlov.models.embedders.elmo_embedder.ELMoEmbedder`: - - >>> from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedder - >>> spec = f"{MODELS_PATH}/elmo-1b-benchmark_test/saves/hubs/tf_hub_model_epoch_n_1/" - >>> elmo = ELMoEmbedder(spec) - >>> elmo([['вопрос', 'жизни', 'Вселенной', 'и', 'вообще', 'всего'], ['42']]) - array([[ 0.00719104, 0.08544601, -0.07179783, ..., 0.10879009, - -0.18630421, -0.2189409 ], - [ 0.16325025, -0.04736076, 0.12354863, ..., -0.1889013 , - 0.04972512, 0.83029324]], dtype=float32) - - """ - - def __init__(self, - options_json_path: Optional[str] = None, # Configure by json file - char_cnn: Optional[dict] = None, # Net architecture by direct params, use for overwrite a json arch. 
- bidirectional: Optional[bool] = None, - unroll_steps: Optional[int] = None, - n_tokens_vocab: Optional[int] = None, - lstm: Optional[dict] = None, - dropout: Optional[float] = None, # Regularization - n_negative_samples_batch: Optional[int] = None, # Train options - all_clip_norm_val: Optional[float] = None, - initial_accumulator_value: float = 1.0, - learning_rate: float = 2e-1, # For AdagradOptimizer - n_gpus: int = 1, # TODO: Add cpu supporting - seed: Optional[int] = None, # Other - batch_size: int = 128, # Data params - load_epoch_num: Optional[int] = None, - epoch_load_path: str = 'epochs', - epoch_save_path: Optional[str] = None, - dumps_save_path: str = 'dumps', - tf_hub_save_path: str = 'hubs', - **kwargs) -> None: - - # ================ Checking input args ================= - if not (options_json_path or (char_cnn and bidirectional and unroll_steps - and n_tokens_vocab and lstm and dropout and - n_negative_samples_batch and all_clip_norm_val - )): - raise Warning('Use options_json_path or/and direct params to set net architecture.') - self.options = self._load_options(options_json_path) - self._update_arch_options(char_cnn, bidirectional, unroll_steps, n_tokens_vocab, lstm) - self._update_other_options(dropout, n_negative_samples_batch, all_clip_norm_val) - - # Special options - self.options['learning_rate'] = learning_rate - self.options['initial_accumulator_value'] = initial_accumulator_value - self.options['seed'] = seed - self.options['n_gpus'] = n_gpus - self.options['batch_size'] = batch_size - - self.permanent_options = self.options - - self.train_options = {} - self.valid_options = {'batch_size': 256, 'unroll_steps': 1, 'n_gpus': 1} - self.model_mode = '' - - tf.set_random_seed(seed) - np.random.seed(seed) - - super().__init__(**kwargs) - - self.epoch_load_path = epoch_load_path - - if load_epoch_num is None: - load_epoch_num = self._get_epoch_from(self.epoch_load_path, None) - - if epoch_save_path is None: - self.epoch_save_path = self.epoch_load_path - - self.save_epoch_num = self._get_epoch_from(self.epoch_save_path) - - self.dumps_save_path = dumps_save_path - self.tf_hub_save_path = tf_hub_save_path - - self._build_model(train=False, epoch=load_epoch_num) - - self.save() - # after building the model and saving to the specified save path - # change the way to load intermediate checkpoints - self.load_path = self.save_path - - def _load_options(self, options_json_path): - if options_json_path: - options_json_path = expand_path(options_json_path) - with open(options_json_path, 'r') as fin: - options = json.load(fin) - else: - options = {} - return options - - def _update_arch_options(self, char_cnn, bidirectional, unroll_steps, n_tokens_vocab, lstm): - if char_cnn is not None: - self.options['char_cnn'] = char_cnn - if bidirectional is not None: - self.options['bidirectional'] = bidirectional - if unroll_steps is not None: - self.options['unroll_steps'] = unroll_steps - if n_tokens_vocab is not None: - self.options['n_tokens_vocab'] = n_tokens_vocab - if lstm is not None: - self.options['lstm'] = lstm - - def _update_other_options(self, dropout, n_negative_samples_batch, all_clip_norm_val): - if dropout is not None: - self.options['dropout'] = dropout - if n_negative_samples_batch is not None: - self.options['n_negative_samples_batch'] = n_negative_samples_batch - if all_clip_norm_val is not None: - self.options['all_clip_norm_val'] = all_clip_norm_val - - def _get_epoch_from(self, epoch_load_path, default=0): - path = self.load_path - path = path.parent / 
epoch_load_path - candidates = path.resolve().glob('[0-9]*') - candidates = list(safely_str2int(i.parts[-1]) for i in candidates - if safely_str2int(i.parts[-1]) is not None) - epoch_num = max(candidates, default=default) - return epoch_num - - def _build_graph(self, graph, train=True): - with graph.as_default(): - with tf.device('/cpu:0'): - init_step = 0 - global_step = tf.get_variable( - 'global_step', [], - initializer=tf.constant_initializer(init_step), trainable=False) - self.global_step = global_step - # set up the optimizer - opt = tf.train.AdagradOptimizer(learning_rate=self.options['learning_rate'], - initial_accumulator_value=1.0) - - # calculate the gradients on each GPU - tower_grads = [] - models = [] - loss = tf.get_variable( - 'train_perplexity', [], - initializer=tf.constant_initializer(0.0), trainable=False) - for k in range(self.options['n_gpus']): - with tf.device('/gpu:%d' % k): - with tf.variable_scope('lm', reuse=k > 0): - # calculate the loss for one model replica and get - # lstm states - model = LanguageModel(self.options, True) - total_train_loss = model.total_train_loss - total_eval_loss = model.total_eval_loss - models.append(model) - # get gradients - grads = opt.compute_gradients( - tf.reduce_mean(total_train_loss) * self.options['unroll_steps'], - aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE, - ) - tower_grads.append(grads) - # # keep track of loss across all GPUs - if train: - loss += total_train_loss - else: - loss += total_eval_loss - - # calculate the mean of each gradient across all GPUs - grads = average_gradients(tower_grads, self.options['batch_size'], self.options) - grads, _ = clip_grads(grads, self.options, True, global_step) - loss = loss / self.options['n_gpus'] - train_op = opt.apply_gradients(grads, global_step=global_step) - return models, train_op, loss, graph - - def _init_session(self): - sess_config = tf.ConfigProto(allow_soft_placement=True) - sess_config.gpu_options.allow_growth = True - - self.sess = tf.Session(config=sess_config) - self.sess.run(tf.global_variables_initializer()) - - batch_size = self.options['batch_size'] - unroll_steps = self.options['unroll_steps'] - - # get the initial lstm states - init_state_tensors = [] - final_state_tensors = [] - for model in self.models: - init_state_tensors.extend(model.init_lstm_state) - final_state_tensors.extend(model.final_lstm_state) - - char_inputs = 'char_cnn' in self.options - if char_inputs: - max_chars = self.options['char_cnn']['max_characters_per_token'] - - if not char_inputs: - feed_dict = { - model.token_ids: - np.zeros([batch_size, unroll_steps], dtype=np.int64) - for model in self.models - } - else: - feed_dict = { - model.tokens_characters: - np.zeros([batch_size, unroll_steps, max_chars], - dtype=np.int32) - for model in self.models - } - - if self.options['bidirectional']: - if not char_inputs: - feed_dict.update({ - model.token_ids_reverse: - np.zeros([batch_size, unroll_steps], dtype=np.int64) - for model in self.models - }) - else: - feed_dict.update({ - model.tokens_characters_reverse: - np.zeros([batch_size, unroll_steps, max_chars], - dtype=np.int32) - for model in self.models - }) - - init_state_values = self.sess.run(init_state_tensors, feed_dict=feed_dict) - return init_state_values, init_state_tensors, final_state_tensors - - def _fill_feed_dict(self, - char_ids_batches, - reversed_char_ids_batches, - token_ids_batches=None, - reversed_token_ids_batches=None): - # init state tensors - feed_dict = {t: v for t, v in zip(self.init_state_tensors, 
self.init_state_values)} - - for k, model in enumerate(self.models): - start = k * self.options['batch_size'] - end = (k + 1) * self.options['batch_size'] - - # character inputs - char_ids = char_ids_batches[start:end] # get char_ids - - feed_dict[model.tokens_characters] = char_ids - - if self.options['bidirectional']: - feed_dict[model.tokens_characters_reverse] = \ - reversed_char_ids_batches[start:end] # get tokens_characters_reverse - - if token_ids_batches is not None: - feed_dict[model.next_token_id] = token_ids_batches[start:end] # get next_token_id - if self.options['bidirectional']: - feed_dict[model.next_token_id_reverse] = \ - reversed_token_ids_batches[start:end] # get next_token_id_reverse - - return feed_dict - - def __call__(self, x, y, *args, **kwargs) -> List[float]: - if len(args) != 0: - return [] - char_ids_batches, reversed_char_ids_batches = x - token_ids_batches, reversed_token_ids_batches = y - - feed_dict = self._fill_feed_dict(char_ids_batches, reversed_char_ids_batches, token_ids_batches, - reversed_token_ids_batches) - - with self.graph.as_default(): - loss, self.init_state_values = self.sess.run([self.loss, self.final_state_tensors], feed_dict) - return loss - - @overrides - def load(self, epoch: Optional[int] = None) -> None: - """Load model parameters from self.load_path""" - path = self.load_path - if epoch is not None: - path = path.parent / self.epoch_save_path / str(epoch) / path.parts[-1] - path.resolve() - log.info(f'[loading {epoch} epoch]') - - # path.parent.mkdir(parents=True, exist_ok=True) - path = str(path) - - # Check presence of the model files - if tf.train.checkpoint_exists(path): - log.info(f'[loading model from {path}]') - with self.graph.as_default(): - saver = tf.train.Saver() - saver.restore(self.sess, path) - else: - log.info(f'[A checkpoint not found in {path}]') - - @overrides - def save(self, epoch: Optional[int] = None) -> None: - """Save model parameters to self.save_path""" - path = self.save_path - if epoch is not None: - path = path.parent / self.epoch_save_path / str(epoch) / path.parts[-1] - path.resolve() - log.info(f'[saving {epoch} epoch]') - - path.parent.mkdir(parents=True, exist_ok=True) - path = str(path) - - log.info(f'[saving model to {path}]') - with self.graph.as_default(): - saver = tf.train.Saver() - saver.save(self.sess, path) - - def train_on_batch(self, - x_char_ids: list, - y_token_ids: list) -> List[float]: - """ - This method is called by trainer to make one training step on one batch. 
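-
-        A rough calling sketch (names follow the batch structure unpacked below): the
-        dataset iterator yields ``x_char_ids = (char_ids_batches, reversed_char_ids_batches)``
-        and ``y_token_ids = (token_ids_batches, reversed_token_ids_batches)``, and the
-        trainer repeatedly runs ``loss = model.train_on_batch(x_char_ids, y_token_ids)``.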
- - Args: - x_char_ids: a batch of char_ids - y_token_ids: a batch of token_ids - - Returns: - value of loss function on batch - """ - - char_ids_batches, reversed_char_ids_batches = x_char_ids - token_ids_batches, reversed_token_ids_batches = y_token_ids - - feed_dict = self._fill_feed_dict(char_ids_batches, reversed_char_ids_batches, - token_ids_batches, reversed_token_ids_batches) - - with self.graph.as_default(): - loss, _, self.init_state_values = self.sess.run([self.loss, self.train_op, self.final_state_tensors], - feed_dict) - - return np.mean(loss) - - def _build_model(self, train: bool, epoch: Optional[int] = None, **kwargs): - - if hasattr(self, 'sess'): - self.sess.close() - - self.options = copy.deepcopy(self.permanent_options) - - if train: - self.options.update(self.train_options) - self.options.update(kwargs) - - self.models, self.train_op, self.loss, self.graph = self._build_graph(tf.Graph()) - else: - self.options.update(self.valid_options) - self.options.update(kwargs) - - self.models, self.train_op, self.loss, self.graph = self._build_graph(tf.Graph(), - train=False) - - with self.graph.as_default(): - self.init_state_values, self.init_state_tensors, self.final_state_tensors = \ - self._init_session() - self.load(epoch) - - def process_event(self, event_name, data): - if event_name == 'before_train' and self.model_mode != 'train': - self._build_model(train=True) - self.model_mode = 'train' - elif event_name == 'before_validation' and self.model_mode != 'validation': - epoch = self.save_epoch_num + int(data['epochs_done']) - self.save(epoch) - self.save() - self.elmo_export(epoch) - - self._build_model(train=False) - self.model_mode = 'validation' - - def elmo_export(self, epoch: Optional[int] = None) -> None: - """ - Dump the trained weights from a model to a HDF5 file and export a TF-Hub module. 
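-
-        A minimal usage sketch for the exported module (the epoch number and the hub
-        directory are illustrative; ``MODELS_PATH`` is assumed to be defined):
-
-        .. code:: python
-
-            import tensorflow as tf
-            import tensorflow_hub as hub
-
-            elmo = hub.Module(f"{MODELS_PATH}/elmo_model/saves/hubs/tf_hub_model_epoch_n_1/")
-            # the "default" signature takes a batch of sentence strings and
-            # returns mean ELMo vectors, one per sentence
-            vectors = elmo(['вопрос жизни Вселенной и вообще всего', '42'])
-            with tf.Session() as sess:
-                sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
-                print(sess.run(vectors).shape)  # e.g. (2, 1024) for the standard architecture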
- """ - if hasattr(self, 'sess'): - self.sess.close() - path = self.save_path - if epoch: - from_path = path.parent / self.epoch_save_path / str(epoch) / path.parts[-1] - weights_to_path = path.parent / self.dumps_save_path / f'weights_epoch_n_{epoch}.hdf5' - tf_hub_to_path = path.parent / self.tf_hub_save_path / f'tf_hub_model_epoch_n_{epoch}' - from_path.resolve() - weights_to_path.resolve() - tf_hub_to_path.resolve() - log.info(f'[exporting {epoch} epoch]') - else: - from_path = path - weights_to_path = path.parent / self.dumps_save_path / 'weights.hdf5' - tf_hub_to_path = path.parent / self.tf_hub_save_path / 'tf_hub_model' - - weights_to_path.parent.mkdir(parents=True, exist_ok=True) - tf_hub_to_path.parent.mkdir(parents=True, exist_ok=True) - - # Check presence of the model files - if tf.train.checkpoint_exists(str(from_path)): - dump_weights(from_path.parent, weights_to_path, self.permanent_options) - - options = copy.deepcopy(self.permanent_options) - options['char_cnn']['n_characters'] = 262 - export2hub(weights_to_path, tf_hub_to_path, options) - - def destroy(self) -> None: - """ - Delete model from memory - - Returns: - None - """ - if hasattr(self, 'sess'): - for k in list(self.sess.graph.get_all_collection_keys()): - self.sess.graph.clear_collection(k) - super().destroy() diff --git a/deeppavlov/models/elmo/elmo2tfhub.py b/deeppavlov/models/elmo/elmo2tfhub.py deleted file mode 100644 index a304bf6837..0000000000 --- a/deeppavlov/models/elmo/elmo2tfhub.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2017 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import shutil - -import numpy as np -import tensorflow as tf -import tensorflow_hub as hub - -from deeppavlov.models.elmo.elmo_model import BidirectionalLanguageModel, weight_layers - - -def make_module_spec(options, weight_file): - """Makes a module spec. - - Args: - options: LM hyperparameters. - weight_file: location of the hdf5 file with LM weights. - - Returns: - A module spec object used for constructing a TF-Hub module. 
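-
-    A rough usage sketch (the weight-file path is illustrative):
-
-    .. code:: python
-
-        spec = make_module_spec(options, '/tmp/lm_weights.hdf5')
-        module = hub.Module(spec)  # instantiate a TF-Hub module from the spec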
- """ - - def module_fn(): - """Spec function for a token embedding module.""" - # init - _bos_id = 256 - _eos_id = 257 - _bow_id = 258 - _eow_id = 259 - _pad_id = 260 - - _max_word_length = 50 - _parallel_iterations = 10 - _max_batch_size = 1024 - - id_dtype = tf.int32 - id_nptype = np.int32 - max_word_length = tf.constant(_max_word_length, dtype=id_dtype, name='max_word_length') - - version = tf.constant('from_dp_1', dtype=tf.string, name='version') - - # the charcter representation of the begin/end of sentence characters - def _make_bos_eos(c): - r = np.zeros([_max_word_length], dtype=id_nptype) - r[:] = _pad_id - r[0] = _bow_id - r[1] = c - r[2] = _eow_id - return tf.constant(r, dtype=id_dtype) - - bos_ids = _make_bos_eos(_bos_id) - eos_ids = _make_bos_eos(_eos_id) - - def token2ids(token): - with tf.name_scope("token2ids_preprocessor"): - char_ids = tf.decode_raw(token, tf.uint8, name='decode_raw2get_char_ids') - char_ids = tf.cast(char_ids, tf.int32, name='cast2int_token') - char_ids = tf.strided_slice(char_ids, [0], [max_word_length - 2], - [1], name='slice2resized_token') - ids_num = tf.shape(char_ids)[0] - fill_ids_num = (_max_word_length - 2) - ids_num - pads = tf.fill([fill_ids_num], _pad_id) - bow_token_eow_pads = tf.concat([[_bow_id], char_ids, [_eow_id], pads], - 0, name='concat2bow_token_eow_pads') - return bow_token_eow_pads - - def sentence_tagging_and_padding(sen_dim): - with tf.name_scope("sentence_tagging_and_padding_preprocessor"): - sen = sen_dim[0] - dim = sen_dim[1] - extra_dim = tf.shape(sen)[0] - dim - sen = tf.slice(sen, [0, 0], [dim, max_word_length], name='slice2sen') - - bos_sen_eos = tf.concat([[bos_ids], sen, [eos_ids]], 0, name='concat2bos_sen_eos') - bos_sen_eos_plus_one = bos_sen_eos + 1 - bos_sen_eos_pads = tf.pad(bos_sen_eos_plus_one, [[0, extra_dim], [0, 0]], - "CONSTANT", name='pad2bos_sen_eos_pads') - return bos_sen_eos_pads - - # Input placeholders to the biLM. - tokens = tf.placeholder(shape=(None, None), dtype=tf.string, name='ph2tokens') - sequence_len = tf.placeholder(shape=(None,), dtype=tf.int32, name='ph2sequence_len') - - tok_shape = tf.shape(tokens) - line_tokens = tf.reshape(tokens, shape=[-1], name='reshape2line_tokens') - - with tf.device('/cpu:0'): - tok_ids = tf.map_fn( - token2ids, - line_tokens, - dtype=tf.int32, back_prop=False, parallel_iterations=_parallel_iterations, - name='map_fn2get_tok_ids') - - tok_ids = tf.reshape(tok_ids, [tok_shape[0], tok_shape[1], -1], name='reshape2tok_ids') - with tf.device('/cpu:0'): - sen_ids = tf.map_fn( - sentence_tagging_and_padding, - (tok_ids, sequence_len), - dtype=tf.int32, back_prop=False, parallel_iterations=_parallel_iterations, - name='map_fn2get_sen_ids') - - # Build the biLM graph. 
- bilm = BidirectionalLanguageModel(options, str(weight_file), - max_batch_size=_max_batch_size) - - embeddings_op = bilm(sen_ids) - - # Get an op to compute ELMo (weighted average of the internal biLM layers) - elmo_output = weight_layers('elmo_output', embeddings_op, l2_coef=0.0) - - weighted_op = elmo_output['weighted_op'] - mean_op = elmo_output['mean_op'] - word_emb = elmo_output['word_emb'] - lstm_outputs1 = elmo_output['lstm_outputs1'] - lstm_outputs2 = elmo_output['lstm_outputs2'] - - hub.add_signature("tokens", {"tokens": tokens, "sequence_len": sequence_len}, - {"elmo": weighted_op, - "default": mean_op, - "word_emb": word_emb, - "lstm_outputs1": lstm_outputs1, - "lstm_outputs2": lstm_outputs2, - "version": version}) - - # #########################Next signature############################# # - - # Input placeholders to the biLM. - def_strings = tf.placeholder(shape=(None), dtype=tf.string) - def_tokens_sparse = tf.string_split(def_strings) - def_tokens_dense = tf.sparse_to_dense(sparse_indices=def_tokens_sparse.indices, - output_shape=def_tokens_sparse.dense_shape, - sparse_values=def_tokens_sparse.values, - default_value='' - ) - def_mask = tf.not_equal(def_tokens_dense, '') - def_int_mask = tf.cast(def_mask, dtype=tf.int32) - def_sequence_len = tf.reduce_sum(def_int_mask, axis=-1) - - def_tok_shape = tf.shape(def_tokens_dense) - def_line_tokens = tf.reshape(def_tokens_dense, shape=[-1], name='reshape2line_tokens') - - with tf.device('/cpu:0'): - def_tok_ids = tf.map_fn( - token2ids, - def_line_tokens, - dtype=tf.int32, back_prop=False, parallel_iterations=_parallel_iterations, - name='map_fn2get_tok_ids') - - def_tok_ids = tf.reshape(def_tok_ids, [def_tok_shape[0], def_tok_shape[1], -1], name='reshape2tok_ids') - with tf.device('/cpu:0'): - def_sen_ids = tf.map_fn( - sentence_tagging_and_padding, - (def_tok_ids, def_sequence_len), - dtype=tf.int32, back_prop=False, parallel_iterations=_parallel_iterations, - name='map_fn2get_sen_ids') - - # Get ops to compute the LM embeddings. 
- def_embeddings_op = bilm(def_sen_ids) - - # Get an op to compute ELMo (weighted average of the internal biLM layers) - def_elmo_output = weight_layers('elmo_output', def_embeddings_op, l2_coef=0.0, reuse=True) - - def_weighted_op = def_elmo_output['weighted_op'] - def_mean_op = def_elmo_output['mean_op'] - def_word_emb = def_elmo_output['word_emb'] - def_lstm_outputs1 = def_elmo_output['lstm_outputs1'] - def_lstm_outputs2 = def_elmo_output['lstm_outputs2'] - - hub.add_signature("default", {"strings": def_strings}, - {"elmo": def_weighted_op, - "default": def_mean_op, - "word_emb": def_word_emb, - "lstm_outputs1": def_lstm_outputs1, - "lstm_outputs2": def_lstm_outputs2, - "version": version}) - - return hub.create_module_spec(module_fn) - - -def export2hub(weight_file, hub_dir, options): - """Exports a TF-Hub module - """ - - spec = make_module_spec(options, str(weight_file)) - - try: - with tf.Graph().as_default(): - module = hub.Module(spec) - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - if hub_dir.exists(): - shutil.rmtree(hub_dir) - module.export(str(hub_dir), sess) - finally: - pass diff --git a/deeppavlov/models/elmo/elmo_model.py b/deeppavlov/models/elmo/elmo_model.py deleted file mode 100644 index 8e475dcedb..0000000000 --- a/deeppavlov/models/elmo/elmo_model.py +++ /dev/null @@ -1,730 +0,0 @@ -# originally based on https://github.com/allenai/bilm-tf/blob/master/bilm/model.py - -# Modifications copyright 2017 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import h5py -import numpy as np -import tensorflow as tf - -DTYPE = 'float32' - - -class BidirectionalLanguageModel(object): - def __init__( - self, - options: dict, - weight_file: str, - use_character_inputs=True, - embedding_weight_file=None, - max_batch_size=128): - """ - Creates the language model computational graph and loads weights - - Two options for input type: - (1) To use character inputs (paired with Batcher) - pass use_character_inputs=True, and ids_placeholder - of shape (None, None, max_characters_per_token) - to __call__ - (2) To use token ids as input (paired with TokenBatcher), - pass use_character_inputs=False and ids_placeholder - of shape (None, None) to __call__. 
- In this case, embedding_weight_file is also required input - - options_file: location of the json formatted file with - LM hyperparameters - weight_file: location of the hdf5 file with LM weights - use_character_inputs: if True, then use character ids as input, - otherwise use token ids - max_batch_size: the maximum allowable batch size - """ - if not use_character_inputs: - if embedding_weight_file is None: - raise ValueError( - "embedding_weight_file is required input with " - "not use_character_inputs" - ) - - self._options = options - self._weight_file = weight_file - self._embedding_weight_file = embedding_weight_file - self._use_character_inputs = use_character_inputs - self._max_batch_size = max_batch_size - - self._ops = {} - self._graphs = {} - - def __call__(self, ids_placeholder): - """ - Given the input character ids (or token ids), returns a dictionary - with tensorflow ops: - - {'lm_embeddings': embedding_op, - 'lengths': sequence_lengths_op, - 'mask': op to compute mask} - - embedding_op computes the LM embeddings and is shape - (None, 3, None, 1024) - lengths_op computes the sequence lengths and is shape (None, ) - mask computes the sequence mask and is shape (None, None) - - ids_placeholder: a tf.placeholder of type int32. - If use_character_inputs=True, it is shape - (None, None, max_characters_per_token) and holds the input - character ids for a batch - If use_character_input=False, it is shape (None, None) and - holds the input token ids for a batch - """ - if ids_placeholder in self._ops: - # have already created ops for this placeholder, just return them - ret = self._ops[ids_placeholder] - - else: - # need to create the graph - if len(self._ops) == 0: - # first time creating the graph, don't reuse variables - lm_graph = BidirectionalLanguageModelGraph( - self._options, - self._weight_file, - ids_placeholder, - embedding_weight_file=self._embedding_weight_file, - use_character_inputs=self._use_character_inputs, - max_batch_size=self._max_batch_size) - else: - with tf.variable_scope('', reuse=True): - lm_graph = BidirectionalLanguageModelGraph( - self._options, - self._weight_file, - ids_placeholder, - embedding_weight_file=self._embedding_weight_file, - use_character_inputs=self._use_character_inputs, - max_batch_size=self._max_batch_size) - - ops = self._build_ops(lm_graph) - self._ops[ids_placeholder] = ops - self._graphs[ids_placeholder] = lm_graph - ret = ops - - return ret - - def _build_ops(self, lm_graph): - with tf.control_dependencies([lm_graph.update_state_op]): - # get the LM embeddings - token_embeddings = lm_graph.embedding - layers = [ - tf.concat([token_embeddings, token_embeddings], axis=2) - ] - - n_lm_layers = len(lm_graph.lstm_outputs['forward']) - for i in range(n_lm_layers): - layers.append( - tf.concat( - [lm_graph.lstm_outputs['forward'][i], - lm_graph.lstm_outputs['backward'][i]], - axis=-1 - ) - ) - - # The layers include the BOS/EOS tokens. 
Remove them - sequence_length_wo_bos_eos = lm_graph.sequence_lengths - 2 - layers_without_bos_eos = [] - for layer in layers: - layer_wo_bos_eos = layer[:, 1:, :] - layer_wo_bos_eos = tf.reverse_sequence( - layer_wo_bos_eos, - lm_graph.sequence_lengths - 1, - seq_axis=1, - batch_axis=0, - ) - layer_wo_bos_eos = layer_wo_bos_eos[:, 1:, :] - layer_wo_bos_eos = tf.reverse_sequence( - layer_wo_bos_eos, - sequence_length_wo_bos_eos, - seq_axis=1, - batch_axis=0, - ) - layers_without_bos_eos.append(layer_wo_bos_eos) - - # concatenate the layers - lm_embeddings = tf.concat( - [tf.expand_dims(t, axis=1) for t in layers_without_bos_eos], - axis=1 - ) - - # get the mask op without bos/eos. - # tf doesn't support reversing boolean tensors, so cast - # to int then back - mask_wo_bos_eos = tf.cast(lm_graph.mask[:, 1:], 'int32') - mask_wo_bos_eos = tf.reverse_sequence( - mask_wo_bos_eos, - lm_graph.sequence_lengths - 1, - seq_axis=1, - batch_axis=0, - ) - mask_wo_bos_eos = mask_wo_bos_eos[:, 1:] - mask_wo_bos_eos = tf.reverse_sequence( - mask_wo_bos_eos, - sequence_length_wo_bos_eos, - seq_axis=1, - batch_axis=0, - ) - mask_wo_bos_eos = tf.cast(mask_wo_bos_eos, 'bool') - - return { - 'lm_embeddings': lm_embeddings, - 'lengths': sequence_length_wo_bos_eos, - 'token_embeddings': lm_graph.embedding, - 'mask': mask_wo_bos_eos, - } - - -def _pretrained_initializer(varname, weight_file, embedding_weight_file=None): - """ - We'll stub out all the initializers in the pretrained LM with - a function that loads the weights from the file - """ - weight_name_map = {} - for i in range(2): - for j in range(8): # if we decide to add more layers - root = 'RNN_{}/RNN/MultiRNNCell/Cell{}'.format(i, j) - weight_name_map[root + '/rnn/lstm_cell/kernel'] = \ - root + '/LSTMCell/W_0' - weight_name_map[root + '/rnn/lstm_cell/bias'] = \ - root + '/LSTMCell/B' - weight_name_map[root + '/rnn/lstm_cell/projection/kernel'] = \ - root + '/LSTMCell/W_P_0' - - # convert the graph name to that in the checkpoint - varname_in_file = varname[5:] - if varname_in_file.startswith('RNN'): - varname_in_file = weight_name_map[varname_in_file] - - if varname_in_file == 'embedding': - with h5py.File(embedding_weight_file, 'r') as fin: - # Have added a special 0 index for padding not present - # in the original model. - embed_weights = fin[varname_in_file][...] - weights = np.zeros( - (embed_weights.shape[0] + 1, embed_weights.shape[1]), - dtype=DTYPE - ) - weights[1:, :] = embed_weights - else: - with h5py.File(weight_file, 'r') as fin: - if varname_in_file == 'char_embed': - # Have added a special 0 index for padding not present - # in the original model. - char_embed_weights = fin[varname_in_file][...] - weights = np.zeros( - (char_embed_weights.shape[0] + 1, - char_embed_weights.shape[1]), - dtype=DTYPE - ) - weights[1:, :] = char_embed_weights - else: - weights = fin[varname_in_file][...] 
- - # Tensorflow initializers are callables that accept a shape parameter - # and some optional kwargs - def ret(shape, **kwargs): - if list(shape) != list(weights.shape): - raise ValueError( - "Invalid shape initializing {0}, got {1}, expected {2}".format( - varname_in_file, shape, weights.shape) - ) - return weights - - return ret - - -class BidirectionalLanguageModelGraph(object): - """ - Creates the computational graph and holds the ops necessary for runnint - a bidirectional language model - """ - - def __init__(self, options, weight_file, ids_placeholder, - use_character_inputs=True, embedding_weight_file=None, - max_batch_size=128): - - self.options = options - self._max_batch_size = max_batch_size - self.ids_placeholder = ids_placeholder - self.use_character_inputs = use_character_inputs - - # this custom_getter will make all variables not trainable and - # override the default initializer - def custom_getter(getter, name, *args, **kwargs): - kwargs['trainable'] = False - kwargs['initializer'] = _pretrained_initializer( - name, weight_file, embedding_weight_file - ) - return getter(name, *args, **kwargs) - - if embedding_weight_file is not None: - # get the vocab size - with h5py.File(embedding_weight_file, 'r') as fin: - # +1 for padding - self._n_tokens_vocab = fin['embedding'].shape[0] + 1 - else: - self._n_tokens_vocab = None - - with tf.variable_scope('bilm', custom_getter=custom_getter): - self._build() - - def _build(self): - if self.use_character_inputs: - self._build_word_char_embeddings() - else: - self._build_word_embeddings() - self._build_lstms() - - def _build_word_char_embeddings(self): - """ - options contains key 'char_cnn': { - - 'n_characters': 262, - - # includes the start / end characters - 'max_characters_per_token': 50, - - 'filters': [ - [1, 32], - [2, 32], - [3, 64], - [4, 128], - [5, 256], - [6, 512], - [7, 512] - ], - 'activation': 'tanh', - - # for the character embedding - 'embedding': {'dim': 16} - - # for highway layers - # if omitted, then no highway layers - 'n_highway': 2, - } - """ - projection_dim = self.options['lstm']['projection_dim'] - - cnn_options = self.options['char_cnn'] - filters = cnn_options['filters'] - n_filters = sum(f[1] for f in filters) - max_chars = cnn_options['max_characters_per_token'] - char_embed_dim = cnn_options['embedding']['dim'] - n_chars = cnn_options['n_characters'] - if n_chars != 262: - raise Exception("Set n_characters=262 after training see a \ - https://github.com/allenai/bilm-tf/blob/master/README.md") - - if cnn_options['activation'] == 'tanh': - activation = tf.nn.tanh - elif cnn_options['activation'] == 'relu': - activation = tf.nn.relu - - # the character embeddings - with tf.device("/cpu:0"): - self.embedding_weights = tf.get_variable("char_embed", [n_chars, char_embed_dim], - dtype=DTYPE, - initializer=tf.random_uniform_initializer(-1.0, 1.0)) - # shape (batch_size, unroll_steps, max_chars, embed_dim) - self.char_embedding = tf.nn.embedding_lookup(self.embedding_weights, - self.ids_placeholder) - - # the convolutions - def make_convolutions(inp): - with tf.variable_scope('CNN'): - convolutions = [] - for i, (width, num) in enumerate(filters): - if cnn_options['activation'] == 'relu': - # He initialization for ReLU activation - # with char embeddings init between -1 and 1 - # w_init = tf.random_normal_initializer( - # mean=0.0, - # stddev=np.sqrt(2.0 / (width * char_embed_dim)) - # ) - - # Kim et al 2015, +/- 0.05 - w_init = tf.random_uniform_initializer( - minval=-0.05, maxval=0.05) - elif 
cnn_options['activation'] == 'tanh': - # glorot init - w_init = tf.random_normal_initializer( - mean=0.0, - stddev=np.sqrt(1.0 / (width * char_embed_dim)) - ) - w = tf.get_variable( - "W_cnn_%s" % i, - [1, width, char_embed_dim, num], - initializer=w_init, - dtype=DTYPE) - b = tf.get_variable( - "b_cnn_%s" % i, [num], dtype=DTYPE, - initializer=tf.constant_initializer(0.0)) - - conv = tf.nn.conv2d(inp, w, - strides=[1, 1, 1, 1], - padding="VALID") + b - # now max pool - conv = tf.nn.max_pool(conv, [1, 1, max_chars - width + 1, 1], - [1, 1, 1, 1], 'VALID') - - # activation - conv = activation(conv) - conv = tf.squeeze(conv, squeeze_dims=[2]) - - convolutions.append(conv) - - return tf.concat(convolutions, 2) - - embedding = make_convolutions(self.char_embedding) - - # for highway and projection layers - n_highway = cnn_options.get('n_highway') - use_highway = n_highway is not None and n_highway > 0 - use_proj = n_filters != projection_dim - - if use_highway or use_proj: - # reshape from (batch_size, n_tokens, dim) to (-1, dim) - batch_size_n_tokens = tf.shape(embedding)[0:2] - embedding = tf.reshape(embedding, [-1, n_filters]) - - # set up weights for projection - if use_proj: - assert n_filters > projection_dim - with tf.variable_scope('CNN_proj'): - W_proj_cnn = tf.get_variable( - "W_proj", [n_filters, projection_dim], - initializer=tf.random_normal_initializer( - mean=0.0, stddev=np.sqrt(1.0 / n_filters)), - dtype=DTYPE) - b_proj_cnn = tf.get_variable( - "b_proj", [projection_dim], - initializer=tf.constant_initializer(0.0), - dtype=DTYPE) - - # apply highways layers - def high(x, ww_carry, bb_carry, ww_tr, bb_tr): - carry_gate = tf.nn.sigmoid(tf.matmul(x, ww_carry) + bb_carry) - transform_gate = tf.nn.relu(tf.matmul(x, ww_tr) + bb_tr) - return carry_gate * transform_gate + (1.0 - carry_gate) * x - - if use_highway: - highway_dim = n_filters - - for i in range(n_highway): - with tf.variable_scope('CNN_high_%s' % i): - W_carry = tf.get_variable( - 'W_carry', [highway_dim, highway_dim], - # glorit init - initializer=tf.random_normal_initializer( - mean=0.0, stddev=np.sqrt(1.0 / highway_dim)), - dtype=DTYPE) - b_carry = tf.get_variable( - 'b_carry', [highway_dim], - initializer=tf.constant_initializer(-2.0), - dtype=DTYPE) - W_transform = tf.get_variable( - 'W_transform', [highway_dim, highway_dim], - initializer=tf.random_normal_initializer( - mean=0.0, stddev=np.sqrt(1.0 / highway_dim)), - dtype=DTYPE) - b_transform = tf.get_variable( - 'b_transform', [highway_dim], - initializer=tf.constant_initializer(0.0), - dtype=DTYPE) - - embedding = high(embedding, W_carry, b_carry, - W_transform, b_transform) - - # finally project down if needed - if use_proj: - embedding = tf.matmul(embedding, W_proj_cnn) + b_proj_cnn - - # reshape back to (batch_size, tokens, dim) - if use_highway or use_proj: - shp = tf.concat([batch_size_n_tokens, [projection_dim]], axis=0) - embedding = tf.reshape(embedding, shp) - - # at last assign attributes for remainder of the model - self.embedding = embedding - - def _build_word_embeddings(self): - projection_dim = self.options['lstm']['projection_dim'] - - # the word embeddings - with tf.device("/cpu:0"): - self.embedding_weights = tf.get_variable( - "embedding", [self._n_tokens_vocab, projection_dim], - dtype=DTYPE, - ) - self.embedding = tf.nn.embedding_lookup(self.embedding_weights, - self.ids_placeholder) - - def _build_lstms(self): - # now the LSTMs - # these will collect the initial states for the forward - # (and reverse LSTMs if we are doing bidirectional) - - 
# parse the options - lstm_dim = self.options['lstm']['dim'] - projection_dim = self.options['lstm']['projection_dim'] - n_lstm_layers = self.options['lstm'].get('n_layers', 1) - cell_clip = self.options['lstm'].get('cell_clip') - proj_clip = self.options['lstm'].get('proj_clip') - use_skip_connections = self.options['lstm']['use_skip_connections'] - - # the sequence lengths from input mask - if self.use_character_inputs: - mask = tf.reduce_any(self.ids_placeholder > 0, axis=2) - else: - mask = self.ids_placeholder > 0 - sequence_lengths = tf.reduce_sum(tf.cast(mask, tf.int32), axis=1) - batch_size = tf.shape(sequence_lengths)[0] - - # for each direction, we'll store tensors for each layer - self.lstm_outputs = {'forward': [], 'backward': []} - self.lstm_state_sizes = {'forward': [], 'backward': []} - self.lstm_init_states = {'forward': [], 'backward': []} - self.lstm_final_states = {'forward': [], 'backward': []} - - update_ops = [] - for direction in ['forward', 'backward']: - if direction == 'forward': - layer_input = self.embedding - else: - layer_input = tf.reverse_sequence( - self.embedding, - sequence_lengths, - seq_axis=1, - batch_axis=0 - ) - - for i in range(n_lstm_layers): - if projection_dim < lstm_dim: - # are projecting down output - lstm_cell = tf.nn.rnn_cell.LSTMCell( - lstm_dim, num_proj=projection_dim, - cell_clip=cell_clip, proj_clip=proj_clip) - else: - lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim, - cell_clip=cell_clip, proj_clip=proj_clip) - - if use_skip_connections: - # ResidualWrapper adds inputs to outputs - if i == 0: - # don't add skip connection from token embedding to - # 1st layer output - pass - else: - # add a skip connection - lstm_cell = tf.nn.rnn_cell.ResidualWrapper(lstm_cell) - - # collect the input state, run the dynamic rnn, collect - # the output - # the LSTMs are stateful. 
To support multiple batch sizes, - # we'll allocate size for states up to max_batch_size, - # then use the first batch_size entries for each batch - init_states = [ - tf.Variable( - tf.zeros([self._max_batch_size, dim]), - trainable=False - ) - for dim in lstm_cell.state_size - ] - batch_init_states = [ - state[:batch_size, :] for state in init_states - ] - - if direction == 'forward': - i_direction = 0 - else: - i_direction = 1 - variable_scope_name = 'RNN_{0}/RNN/MultiRNNCell/Cell{1}'.format( - i_direction, i) - with tf.variable_scope(variable_scope_name): - layer_output, final_state = tf.nn.dynamic_rnn( - lstm_cell, - layer_input, - sequence_length=sequence_lengths, - initial_state=tf.nn.rnn_cell.LSTMStateTuple( - *batch_init_states), - ) - - self.lstm_state_sizes[direction].append(lstm_cell.state_size) - self.lstm_init_states[direction].append(init_states) - self.lstm_final_states[direction].append(final_state) - if direction == 'forward': - self.lstm_outputs[direction].append(layer_output) - else: - self.lstm_outputs[direction].append( - tf.reverse_sequence( - layer_output, - sequence_lengths, - seq_axis=1, - batch_axis=0 - ) - ) - - with tf.control_dependencies([layer_output]): - # update the initial states - for i in range(2): - new_state = tf.concat( - [final_state[i][:batch_size, :], - init_states[i][batch_size:, :]], axis=0) - state_update_op = tf.assign(init_states[i], new_state) - update_ops.append(state_update_op) - - layer_input = layer_output - - self.mask = mask - self.sequence_lengths = sequence_lengths - self.update_state_op = tf.group(*update_ops) - - -def weight_layers(name, bilm_ops, l2_coef=None, - use_top_only=False, do_layer_norm=False, reuse=False): - """ - Weight the layers of a biLM with trainable scalar weights to - compute ELMo representations. - - For each output layer, this returns two ops. The first computes - a layer specific weighted average of the biLM layers, and - the second the l2 regularizer loss term. - The regularization terms are also add to tf.GraphKeys.REGULARIZATION_LOSSES - - Input: - name = a string prefix used for the trainable variable names - bilm_ops = the tensorflow ops returned to compute internal - representations from a biLM. This is the return value - from BidirectionalLanguageModel(...)(ids_placeholder) - l2_coef: the l2 regularization coefficient $\lambda$. - Pass None or 0.0 for no regularization. - use_top_only: if True, then only use the top layer. - do_layer_norm: if True, then apply layer normalization to each biLM - layer before normalizing - reuse: reuse an aggregation variable scope. - - Output: - { - 'weighted_op': op to compute weighted average for output, - 'regularization_op': op to compute regularization term - } - """ - - def _l2_regularizer(weights): - if l2_coef is not None: - return l2_coef * tf.reduce_sum(tf.square(weights)) - else: - return 0.0 - - # Get ops for computing LM embeddings and mask - lm_embeddings = bilm_ops['lm_embeddings'] - mask = bilm_ops['mask'] - - n_lm_layers = int(lm_embeddings.get_shape()[1]) - lm_dim = int(lm_embeddings.get_shape()[3]) - # import pdb; pdb.set_trace() - - with tf.control_dependencies([lm_embeddings, mask]): - # Cast the mask and broadcast for layer use. 
- mask_float = tf.cast(mask, 'float32') - broadcast_mask = tf.expand_dims(mask_float, axis=-1) - - def _do_ln(x): - # do layer normalization excluding the mask - x_masked = x * broadcast_mask - N = tf.reduce_sum(mask_float) * lm_dim - mean = tf.reduce_sum(x_masked) / N - variance = tf.reduce_sum(((x_masked - mean) * broadcast_mask) ** 2) / N - return tf.nn.batch_normalization( - x, mean, variance, None, None, 1E-12 - ) - - if use_top_only: - layers = tf.split(lm_embeddings, n_lm_layers, axis=1) - # just the top layer - sum_pieces = tf.squeeze(layers[-1], squeeze_dims=1) - # no regularization - reg = 0.0 - else: - with tf.variable_scope("aggregation", reuse=reuse): - W = tf.get_variable( - '{}_ELMo_W'.format(name), - shape=(n_lm_layers,), - initializer=tf.zeros_initializer, - regularizer=_l2_regularizer, - trainable=True, - ) - - # normalize the weights - normed_weights = tf.split( - tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers - ) - # split LM layers - layers = tf.split(lm_embeddings, n_lm_layers, axis=1) - - # compute the weighted, normalized LM activations - pieces = [] - for w, t in zip(normed_weights, layers): - if do_layer_norm: - pieces.append(w * _do_ln(tf.squeeze(t, squeeze_dims=1))) - else: - pieces.append(w * tf.squeeze(t, squeeze_dims=1)) - sum_pieces = tf.add_n(pieces) - - # get the regularizer - reg = [ - r for r in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - if r.name.find('{}_ELMo_W/'.format(name)) >= 0 - ] - if len(reg) != 1: - raise ValueError - - # scale the weighted sum by gamma - - with tf.variable_scope("aggregation", reuse=reuse): - gamma = tf.get_variable( - '{}_ELMo_gamma'.format(name), - shape=(1,), - initializer=tf.ones_initializer, - regularizer=None, - trainable=True, - ) - - weighted_lm_layers = sum_pieces * gamma - weighted_lm_layers_masked = sum_pieces * broadcast_mask - - weighted_lm_layers_sum = tf.reduce_sum(weighted_lm_layers_masked, 1) - - mask_sum = tf.reduce_sum(mask_float, 1) - mask_sum = tf.maximum(mask_sum, [1]) - - weighted_lm_layers_mean = weighted_lm_layers_sum / tf.expand_dims(mask_sum, - 1) - - word_emb_2n = tf.squeeze(layers[0], [1]) - word_emb_1n = tf.slice(word_emb_2n, [0, 0, 0], [-1, -1, lm_dim // 2]) # to 512 - lstm_outputs1 = tf.squeeze(layers[1], [1]) - lstm_outputs2 = tf.squeeze(layers[2], [1]) - - ret = {'weighted_op': weighted_lm_layers, - 'mean_op': weighted_lm_layers_mean, - 'regularization_op': reg, - 'word_emb': word_emb_1n, - 'lstm_outputs1': lstm_outputs1, - 'lstm_outputs2': lstm_outputs2, } - - return ret diff --git a/deeppavlov/models/elmo/train_utils.py b/deeppavlov/models/elmo/train_utils.py deleted file mode 100644 index 4be3c7f4d3..0000000000 --- a/deeppavlov/models/elmo/train_utils.py +++ /dev/null @@ -1,244 +0,0 @@ -# originally based on https://github.com/allenai/bilm-tf/blob/master/bilm/training.py - -# Modifications copyright 2017 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
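-
-# A rough picture of how the helpers below fit together during training (see
-# ELMo._build_graph): per-GPU gradients are collected into ``tower_grads``,
-# averaged with ``average_gradients(...)``, clipped with ``clip_grads(...)``
-# and then applied with ``opt.apply_gradients(...)``.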
- -import re - -import h5py -import tensorflow as tf - -from deeppavlov.models.elmo.bilm_model import LanguageModel - -tf.logging.set_verbosity(tf.logging.INFO) - - -def average_gradients(tower_grads, batch_size, options): - # calculate average gradient for each shared variable across all GPUs - average_grads = [] - for grad_and_vars in zip(*tower_grads): - # Note that each grad_and_vars looks like the following: - # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) - # We need to average the gradients across each GPU. - - g0, v0 = grad_and_vars[0] - - if g0 is None: - # no gradient for this variable, skip it - average_grads.append((g0, v0)) - continue - - if isinstance(g0, tf.IndexedSlices): - # If the gradient is type IndexedSlices then this is a sparse - # gradient with attributes indices and values. - # To average, need to concat them individually then create - # a new IndexedSlices object. - indices = [] - values = [] - for g, v in grad_and_vars: - indices.append(g.indices) - values.append(g.values) - all_indices = tf.concat(indices, 0) - avg_values = tf.concat(values, 0) / len(grad_and_vars) - # deduplicate across indices - av, ai = _deduplicate_indexed_slices(avg_values, all_indices) - grad = tf.IndexedSlices(av, ai, dense_shape=g0.dense_shape) - - else: - # a normal tensor can just do a simple average - grads = [] - for g, v in grad_and_vars: - # Add 0 dimension to the gradients to represent the tower. - expanded_g = tf.expand_dims(g, 0) - # Append on a 'tower' dimension which we will average over - grads.append(expanded_g) - - # Average over the 'tower' dimension. - grad = tf.concat(grads, 0) - grad = tf.reduce_mean(grad, 0) - - # the Variables are redundant because they are shared - # across towers. So.. just return the first tower's pointer to - # the Variable. - v = grad_and_vars[0][1] - grad_and_var = (grad, v) - - average_grads.append(grad_and_var) - - assert len(average_grads) == len(list(zip(*tower_grads))) - - return average_grads - - -def summary_gradient_updates(grads, opt, lr): - """get summary ops for the magnitude of gradient updates""" - - # strategy: - # make a dict of variable name -> [variable, grad, adagrad slot] - vars_grads = {} - for v in tf.trainable_variables(): - vars_grads[v.name] = [v, None, None] - for g, v in grads: - vars_grads[v.name][1] = g - vars_grads[v.name][2] = opt.get_slot(v, 'accumulator') - - # now make summaries - ret = [] - for vname, (v, g, a) in vars_grads.items(): - - if g is None: - continue - - if isinstance(g, tf.IndexedSlices): - # a sparse gradient - only take norm of params that are updated - updates = lr * g.values - if a is not None: - updates /= tf.sqrt(tf.gather(a, g.indices)) - else: - updates = lr * g - if a is not None: - updates /= tf.sqrt(a) - - values_norm = tf.sqrt(tf.reduce_sum(v * v)) + 1.0e-7 - updates_norm = tf.sqrt(tf.reduce_sum(updates * updates)) - ret.append(tf.summary.scalar('UPDATE/' + vname.replace(":", "_"), updates_norm / values_norm)) - - return ret - - -def _deduplicate_indexed_slices(values, indices): - """Sums `values` associated with any non-unique `indices`. - Args: - values: A `Tensor` with rank >= 1. - indices: A one-dimensional integer `Tensor`, indexing into the first - dimension of `values` (as in an IndexedSlices object). - Returns: - A tuple of (`summed_values`, `unique_indices`) where `unique_indices` is a - de-duplicated version of `indices` and `summed_values` contains the sum of - `values` slices associated with each unique index. 
- """ - unique_indices, new_index_positions = tf.unique(indices) - summed_values = tf.unsorted_segment_sum(values, - new_index_positions, - tf.shape(unique_indices)[0]) - return (summed_values, unique_indices) - - -def clip_by_global_norm_summary(t_list, clip_norm, norm_name, variables): - # wrapper around tf.clip_by_global_norm that also does summary ops of norms - - # compute norms - # use global_norm with one element to handle IndexedSlices vs dense - norms = [tf.global_norm([t]) for t in t_list] - - # summary ops before clipping - summary_ops = [] - for ns, v in zip(norms, variables): - name = 'norm_pre_clip/' + v.name.replace(":", "_") - summary_ops.append(tf.summary.scalar(name, ns)) - - # clip - clipped_t_list, tf_norm = tf.clip_by_global_norm(t_list, clip_norm) - - # summary ops after clipping - norms_post = [tf.global_norm([t]) for t in clipped_t_list] - for ns, v in zip(norms_post, variables): - name = 'norm_post_clip/' + v.name.replace(":", "_") - summary_ops.append(tf.summary.scalar(name, ns)) - - summary_ops.append(tf.summary.scalar(norm_name, tf_norm)) - - return clipped_t_list, tf_norm, summary_ops - - -def clip_grads(grads, options, do_summaries, global_step): - # grads = [(grad1, var1), (grad2, var2), ...] - def _clip_norms(grad_and_vars, val, name): - # grad_and_vars is a list of (g, v) pairs - grad_tensors = [g for g, v in grad_and_vars] - vv = [v for g, v in grad_and_vars] - scaled_val = val - if do_summaries: - clipped_tensors, g_norm, so = clip_by_global_norm_summary( - grad_tensors, scaled_val, name, vv) - else: - so = [] - clipped_tensors, g_norm = tf.clip_by_global_norm( - grad_tensors, scaled_val) - - ret = [] - for t, (g, v) in zip(clipped_tensors, grad_and_vars): - ret.append((t, v)) - - return ret, so - - all_clip_norm_val = options['all_clip_norm_val'] - ret, summary_ops = _clip_norms(grads, all_clip_norm_val, 'norm_grad') - - assert len(ret) == len(grads) - - return ret, summary_ops - - -def safely_str2int(in_str: str): - try: - i = int(in_str) - except ValueError: - i = None - return i - - -def dump_weights(tf_save_dir, outfile, options): - """ - Dump the trained weights from a model to a HDF5 file. - """ - - def _get_outname(tf_name): - outname = re.sub(':0$', '', tf_name) - outname = outname.lstrip('lm/') - outname = re.sub('/rnn/', '/RNN/', outname) - outname = re.sub('/multi_rnn_cell/', '/MultiRNNCell/', outname) - outname = re.sub('/cell_', '/Cell', outname) - outname = re.sub('/lstm_cell/', '/LSTMCell/', outname) - if '/RNN/' in outname: - if 'projection' in outname: - outname = re.sub('projection/kernel', 'W_P_0', outname) - else: - outname = re.sub('/kernel', '/W_0', outname) - outname = re.sub('/bias', '/B', outname) - return outname - - ckpt_file = tf.train.latest_checkpoint(tf_save_dir) - - config = tf.ConfigProto(allow_soft_placement=True) - with tf.Graph().as_default(): - with tf.Session(config=config) as sess: - with tf.variable_scope('lm'): - LanguageModel(options, False) # Create graph - # we use the "Saver" class to load the variables - loader = tf.train.Saver() - loader.restore(sess, ckpt_file) - - with h5py.File(outfile, 'w') as fout: - for v in tf.trainable_variables(): - if v.name.find('softmax') >= 0: - # don't dump these - continue - outname = _get_outname(v.name) - # print("Saving variable {0} with name {1}".format( - # v.name, outname)) - shape = v.get_shape().as_list() - dset = fout.create_dataset(outname, shape, dtype='float32') - values = sess.run([v])[0] - dset[...] 
= values diff --git a/docs/apiref/dataset_iterators.rst b/docs/apiref/dataset_iterators.rst index c5c5c408c6..f95faa595d 100644 --- a/docs/apiref/dataset_iterators.rst +++ b/docs/apiref/dataset_iterators.rst @@ -15,8 +15,6 @@ Concrete DatasetIterator classes. .. autoclass:: deeppavlov.dataset_iterators.dstc2_ner_iterator.Dstc2NerDatasetIterator -.. autoclass:: deeppavlov.dataset_iterators.elmo_file_paths_iterator.ELMoFilePathsIterator - .. autoclass:: deeppavlov.dataset_iterators.file_paths_iterator.FilePathsIterator .. autoclass:: deeppavlov.dataset_iterators.kvret_dialog_iterator.KvretDialogDatasetIterator diff --git a/docs/apiref/models/elmo.rst b/docs/apiref/models/elmo.rst deleted file mode 100644 index f3e2666488..0000000000 --- a/docs/apiref/models/elmo.rst +++ /dev/null @@ -1,6 +0,0 @@ -deeppavlov.models.elmo -====================== - -.. automodule:: deeppavlov.models.elmo - -.. autoclass:: deeppavlov.models.elmo.elmo.ELMo diff --git a/docs/features/models/neural_ranking.rst b/docs/features/models/neural_ranking.rst index a02f089f4d..bef692cb83 100644 --- a/docs/features/models/neural_ranking.rst +++ b/docs/features/models/neural_ranking.rst @@ -121,7 +121,6 @@ Before using the model make sure that all required packages are installed runnin .. code:: bash python -m deeppavlov install paraphrase_ident_paraphraser - python -m deeppavlov install elmo_paraphraser_fine_tuning To train the model on the `paraphraser.ru`_ dataset with fasttext embeddings one can use the following code in python: @@ -131,15 +130,6 @@ To train the model on the `paraphraser.ru`_ dataset with fasttext embeddings one para_model = train_model(configs.ranking.paraphrase_ident_paraphraser, download=True) - -To train the model on the `paraphraser.ru`_ dataset with fine-tuned ELMO embeddings one should first fine-tune ELMO embeddings: - -.. 
code:: python - - from deeppavlov import configs, train_model - - para_model = train_model(configs.elmo.elmo_paraphraser_fine_tuning, download=True) - Training and inference on your own data --------------------------------------- diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py index 2b7d04aa6a..d4b2360f1d 100644 --- a/tests/test_quick_start.py +++ b/tests/test_quick_start.py @@ -291,9 +291,6 @@ "elmo_embedder": { ("embedder/elmo_ru_news.json", "embedder_ru_news", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], }, - "elmo_model": { - ("elmo/elmo_1b_benchmark_test.json", "elmo_1b_benchmark_test", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], - }, "ranking": { ("ranking/ranking_ubuntu_v2_mt.json", "ranking", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], ("ranking/ranking_ubuntu_v2_mt_interact.json", "ranking", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], From 9e718b53ba5f4f89d95da0fe3b9f24d96b3b0e84 Mon Sep 17 00:00:00 2001 From: Fedor Ignatov Date: Wed, 20 Oct 2021 16:34:00 +0300 Subject: [PATCH 2/3] refactor: remove deeppavlov.configs.skills (#1499) * remove: deeppavlov.configs.skills * delete: aiml_skill component * remove: rasa_skill component * remove: DSLSkill component * docs: removed skills from docs apiref --- deeppavlov/configs/skills/aiml_skill.json | 44 --- deeppavlov/configs/skills/dsl_skill.json | 40 --- deeppavlov/configs/skills/rasa_skill.json | 39 --- deeppavlov/core/common/registry.json | 2 - .../core/common/requirements_registry.json | 7 - deeppavlov/requirements/aiml_skill.txt | 1 - deeppavlov/requirements/rasa_skill.txt | 1 - deeppavlov/skills/__init__.py | 0 deeppavlov/skills/aiml_skill/README.md | 6 - deeppavlov/skills/aiml_skill/__init__.py | 1 - deeppavlov/skills/aiml_skill/aiml_skill.py | 158 ---------- deeppavlov/skills/dsl_skill/__init__.py | 3 - deeppavlov/skills/dsl_skill/context.py | 53 ---- deeppavlov/skills/dsl_skill/dsl_skill.py | 225 --------------- .../skills/dsl_skill/handlers/__init__.py | 0 .../skills/dsl_skill/handlers/handler.py | 68 ----- .../dsl_skill/handlers/regex_handler.py | 80 ------ deeppavlov/skills/dsl_skill/utils.py | 22 -- deeppavlov/skills/rasa_skill/__init__.py | 1 - deeppavlov/skills/rasa_skill/rasa_skill.py | 269 ------------------ docs/apiref/skills.rst | 12 - docs/apiref/skills/aiml_skill.rst | 5 - docs/apiref/skills/dsl_skill.rst | 5 - docs/apiref/skills/rasa_skill.rst | 5 - docs/conf.py | 2 +- docs/features/skills/aiml_skill.rst | 44 --- docs/features/skills/dsl_skill.rst | 42 --- docs/features/skills/rasa_skill.rst | 50 ---- docs/index.rst | 3 - examples/gobot_md_yaml_configs_tutorial.ipynb | 1 - tests/test_aiml_skill.py | 37 --- tests/test_dsl_skill.py | 109 ------- tests/test_rasa_skill.py | 39 --- 33 files changed, 1 insertion(+), 1373 deletions(-) delete mode 100644 deeppavlov/configs/skills/aiml_skill.json delete mode 100644 deeppavlov/configs/skills/dsl_skill.json delete mode 100644 deeppavlov/configs/skills/rasa_skill.json delete mode 100644 deeppavlov/requirements/aiml_skill.txt delete mode 100644 deeppavlov/requirements/rasa_skill.txt delete mode 100644 deeppavlov/skills/__init__.py delete mode 100644 deeppavlov/skills/aiml_skill/README.md delete mode 100644 deeppavlov/skills/aiml_skill/__init__.py delete mode 100644 deeppavlov/skills/aiml_skill/aiml_skill.py delete mode 100644 deeppavlov/skills/dsl_skill/__init__.py delete mode 100644 deeppavlov/skills/dsl_skill/context.py delete mode 100644 deeppavlov/skills/dsl_skill/dsl_skill.py delete mode 100644 deeppavlov/skills/dsl_skill/handlers/__init__.py delete mode 100644 
deeppavlov/skills/dsl_skill/handlers/handler.py delete mode 100644 deeppavlov/skills/dsl_skill/handlers/regex_handler.py delete mode 100644 deeppavlov/skills/dsl_skill/utils.py delete mode 100644 deeppavlov/skills/rasa_skill/__init__.py delete mode 100644 deeppavlov/skills/rasa_skill/rasa_skill.py delete mode 100644 docs/apiref/skills.rst delete mode 100644 docs/apiref/skills/aiml_skill.rst delete mode 100644 docs/apiref/skills/dsl_skill.rst delete mode 100644 docs/apiref/skills/rasa_skill.rst delete mode 100644 docs/features/skills/aiml_skill.rst delete mode 100644 docs/features/skills/dsl_skill.rst delete mode 100644 docs/features/skills/rasa_skill.rst delete mode 100644 tests/test_aiml_skill.py delete mode 100644 tests/test_dsl_skill.py delete mode 100644 tests/test_rasa_skill.py diff --git a/deeppavlov/configs/skills/aiml_skill.json b/deeppavlov/configs/skills/aiml_skill.json deleted file mode 100644 index 5a454fa4da..0000000000 --- a/deeppavlov/configs/skills/aiml_skill.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "chainer": { - "in": [ - "utterances_batch", - "states_batch" - ], - "out": [ - "responses_batch", - "confidences_batch", - "output_states_batch" - ], - "pipe": [ - { - "class_name": "aiml_skill", - "path_to_aiml_scripts": "{DOWNLOADS_PATH}/aiml_scripts", - "positive_confidence": 0.66, - "null_response": "I don't know", - "null_confidence": 0.33, - "in": [ - "utterances_batch", - "states_batch" - ], - "out": [ - "responses_batch", - "confidences_batch", - "output_states_batch" - ] - } - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/aiml_skill/aiml_scripts.tar.gz", - "subdir": "{DOWNLOADS_PATH}" - } - ] - } -} \ No newline at end of file diff --git a/deeppavlov/configs/skills/dsl_skill.json b/deeppavlov/configs/skills/dsl_skill.json deleted file mode 100644 index 296c0708ee..0000000000 --- a/deeppavlov/configs/skills/dsl_skill.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "chainer": { - "in": [ - "utterances_batch", - "user_ids_batch" - ], - "out": [ - "responses_batch", - "confidences_batch" - ], - "pipe": [ - { - "class_name": "ru_tokenizer", - "in": "utterances_batch", - "lowercase": true, - "out": "utterance_tokens_batch" - }, - { - "class_name": "DSLSkill", - "on_invalid_command": "Sorry, I do not understand you", - "null_confidence": 0.0, - "in": [ - "utterance_tokens_batch", - "user_ids_batch" - ], - "out": [ - "responses_batch", - "confidences_batch" - ] - } - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" - } - } -} \ No newline at end of file diff --git a/deeppavlov/configs/skills/rasa_skill.json b/deeppavlov/configs/skills/rasa_skill.json deleted file mode 100644 index 22936c660d..0000000000 --- a/deeppavlov/configs/skills/rasa_skill.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "chainer": { - "in": [ - "utterances" - ], - "out": [ - "responses_batch", - "confidences_batch" - ], - "pipe": [ - { - "class_name": "rasa_skill", - "path_to_models": "{PROJECT_ROOT}/models", - "in": [ - "utterances" - ], - "out": [ - "responses_batch", - "confidences_batch", - "output_states_batch" - ] - } - ] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models", - "PROJECT_ROOT": "{DOWNLOADS_PATH}/rasa_tutorial_project" - }, - 
"download": [ - { - "url": "http://files.deeppavlov.ai/rasa_skill/rasa_tutorial_project.tar.gz", - "subdir": "{DOWNLOADS_PATH}" - } - ] - } -} diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index 51157417e7..2ed9d8e217 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -1,6 +1,5 @@ { "UD_pymorphy_lemmatizer": "deeppavlov.models.morpho_tagger.lemmatizer:UDPymorphyLemmatizer", - "aiml_skill": "deeppavlov.skills.aiml_skill.aiml_skill:AIMLSkill", "api_requester": "deeppavlov.models.api_requester.api_requester:ApiRequester", "api_router": "deeppavlov.models.api_requester.api_router:ApiRouter", "base64_decode_bytesIO": "deeppavlov.models.nemo.common:ascii_to_bytes_io", @@ -117,7 +116,6 @@ "query_generator_online": "deeppavlov.models.kbqa.query_generator_online:QueryGeneratorOnline", "question_sign_checker": "deeppavlov.models.kbqa.entity_detection_parser:QuestionSignChecker", "random_emb_mat": "deeppavlov.models.preprocessors.random_embeddings_matrix:RandomEmbeddingsMatrix", - "rasa_skill": "deeppavlov.skills.rasa_skill.rasa_skill:RASASkill", "rel_ranker": "deeppavlov.models.ranking.rel_ranker:RelRanker", "rel_ranking_bert_infer": "deeppavlov.models.kbqa.rel_ranking_bert_infer:RelRankerBertInfer", "rel_ranking_infer": "deeppavlov.models.kbqa.rel_ranking_infer:RelRankerInfer", diff --git a/deeppavlov/core/common/requirements_registry.json b/deeppavlov/core/common/requirements_registry.json index 9434087155..7d4f1bc84f 100644 --- a/deeppavlov/core/common/requirements_registry.json +++ b/deeppavlov/core/common/requirements_registry.json @@ -3,9 +3,6 @@ "{DEEPPAVLOV_PATH}/requirements/morpho_tagger.txt", "{DEEPPAVLOV_PATH}/requirements/tf.txt" ], - "aiml_skill": [ - "{DEEPPAVLOV_PATH}/requirements/aiml_skill.txt" - ], "bert_classifier": [ "{DEEPPAVLOV_PATH}/requirements/tf.txt", "{DEEPPAVLOV_PATH}/requirements/bert_dp.txt" @@ -137,10 +134,6 @@ "{DEEPPAVLOV_PATH}/requirements/morpho_tagger.txt", "{DEEPPAVLOV_PATH}/requirements/tf.txt" ], - "rasa_skill": [ - "{DEEPPAVLOV_PATH}/requirements/rasa_skill.txt", - "{DEEPPAVLOV_PATH}/requirements/tf.txt" - ], "rel_ranker": [ "{DEEPPAVLOV_PATH}/requirements/tf.txt" ], diff --git a/deeppavlov/requirements/aiml_skill.txt b/deeppavlov/requirements/aiml_skill.txt deleted file mode 100644 index 6a6602091e..0000000000 --- a/deeppavlov/requirements/aiml_skill.txt +++ /dev/null @@ -1 +0,0 @@ -python-aiml==0.9.3 \ No newline at end of file diff --git a/deeppavlov/requirements/rasa_skill.txt b/deeppavlov/requirements/rasa_skill.txt deleted file mode 100644 index bfb2598b2d..0000000000 --- a/deeppavlov/requirements/rasa_skill.txt +++ /dev/null @@ -1 +0,0 @@ -git+https://github.com/deepmipt/rasa.git@b0a80916e54ed9f4496c709a28f1093f7a5f2492#egg=rasa==1.2.7 diff --git a/deeppavlov/skills/__init__.py b/deeppavlov/skills/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/deeppavlov/skills/aiml_skill/README.md b/deeppavlov/skills/aiml_skill/README.md deleted file mode 100644 index cad5e100ed..0000000000 --- a/deeppavlov/skills/aiml_skill/README.md +++ /dev/null @@ -1,6 +0,0 @@ -This skill wraps python-aiml library and allows developer to integrate AIML scripts into DeepPavlov dialog system. - -If you'd like to find more free AIML scripts here is link: -https://github.com/pandorabots/Free-AIML - -You can set path to folder with your AIML scripts as config param (see attr `path_to_aiml_scripts`). 
\ No newline at end of file diff --git a/deeppavlov/skills/aiml_skill/__init__.py b/deeppavlov/skills/aiml_skill/__init__.py deleted file mode 100644 index e5b4b02f6b..0000000000 --- a/deeppavlov/skills/aiml_skill/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .aiml_skill import AIMLSkill diff --git a/deeppavlov/skills/aiml_skill/aiml_skill.py b/deeppavlov/skills/aiml_skill/aiml_skill.py deleted file mode 100644 index 51bf6f2360..0000000000 --- a/deeppavlov/skills/aiml_skill/aiml_skill.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2017 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import uuid -from logging import getLogger -from pathlib import Path -from typing import Tuple, Optional, List - -import aiml - -from deeppavlov.core.common.registry import register -from deeppavlov.core.models.component import Component - -log = getLogger(__name__) - - -@register("aiml_skill") -class AIMLSkill(Component): - """Skill wraps python-aiml library into DeepPavlov interfrace. - AIML uses directory with AIML scripts which are loaded at initialization and used as patterns - for answering at each step. - """ - - def __init__(self, - path_to_aiml_scripts: str, - positive_confidence: float = 0.66, - null_response: str = "I don't know what to answer you", - null_confidence: float = 0.33, - **kwargs - ) -> None: - """ - Construct skill: - read AIML scripts, - load AIML kernel - - Args: - path_to_aiml_scripts: string path to folder with AIML scripts - null_response: Response string to answer if no AIML Patterns matched - positive_confidence: The confidence of response if response was found in AIML scripts - null_confidence: The confidence when AIML scripts has no rule for responding and system returns null_response - """ - # we need absolute path (expanded for user home and resolved if it relative path): - self.path_to_aiml_scripts = Path(path_to_aiml_scripts).expanduser().resolve() - log.info(f"path_to_aiml_scripts is: `{self.path_to_aiml_scripts}`") - - self.positive_confidence = positive_confidence - self.null_confidence = null_confidence - self.null_response = null_response - self.kernel = aiml.Kernel() - # to block AIML output: - self.kernel._verboseMode = False - self._load_scripts() - - def _load_scripts(self) -> None: - """ - Scripts are loaded recursively from files with extensions .xml and .aiml - Returns: None - - """ - # learn kernel to all aimls in directory tree: - all_files = sorted(self.path_to_aiml_scripts.rglob('*.*')) - learned_files = [] - for each_file_path in all_files: - if each_file_path.suffix in ['.aiml', '.xml']: - # learn the script file - self.kernel.learn(str(each_file_path)) - learned_files.append(each_file_path) - if not learned_files: - log.warning(f"No .aiml or .xml files found for AIML Kernel in directory {self.path_to_aiml_scripts}") - - def process_step(self, utterance_str: str, user_id: any) -> Tuple[str, float]: - response = self.kernel.respond(utterance_str, sessionID=user_id) - # here put your estimation of confidence: - if 
response: - # print(f"AIML responds: {response}") - confidence = self.positive_confidence - else: - # print("AIML responses silently...") - response = self.null_response - confidence = self.null_confidence - return response, confidence - - def _generate_user_id(self) -> str: - """Here you put user id generative logic if you want to implement it in the skill. - - Returns: - user_id: Random generated user ID. - - """ - return uuid.uuid1().hex - - def __call__(self, - utterances_batch: List[str], - states_batch: Optional[List] = None) -> Tuple[List[str], List[float], list]: - """Returns skill inference result. - - Returns batches of skill inference results, estimated confidence - levels and up to date states corresponding to incoming utterance - batch. - - Args: - utterances_batch: A batch of utterances of str type. - states_batch: A batch of arbitrary typed states for - each utterance. - - - Returns: - response: A batch of arbitrary typed skill inference results. - confidence: A batch of float typed confidence levels for each of - skill inference result. - output_states_batch: A batch of arbitrary typed states for - each utterance. - - """ - # grasp user_ids from states batch. - # We expect that skill receives None or dict of state for each utterance. - # if state has user_id then skill uses it, otherwise it generates user_id and calls the - # user with this name in further. - - # In this implementation we use current datetime for generating uniqe ids - output_states_batch = [] - user_ids = [] - if states_batch is None: - # generate states batch matching batch of utterances: - states_batch = [None] * len(utterances_batch) - - for state in states_batch: - if not state: - user_id = self._generate_user_id() - new_state = {'user_id': user_id} - - elif 'user_id' not in state: - new_state = state - user_id = self._generate_user_id() - new_state['user_id'] = self._generate_user_id() - - else: - new_state = state - user_id = new_state['user_id'] - - user_ids.append(user_id) - output_states_batch.append(new_state) - - confident_responses = map(self.process_step, utterances_batch, user_ids) - responses_batch, confidences_batch = zip(*confident_responses) - - return responses_batch, confidences_batch, output_states_batch diff --git a/deeppavlov/skills/dsl_skill/__init__.py b/deeppavlov/skills/dsl_skill/__init__.py deleted file mode 100644 index d2b332d4b6..0000000000 --- a/deeppavlov/skills/dsl_skill/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .context import UserContext -from .dsl_skill import DSLMeta -from .utils import SkillResponse, UserId diff --git a/deeppavlov/skills/dsl_skill/context.py b/deeppavlov/skills/dsl_skill/context.py deleted file mode 100644 index acbfc6c5b9..0000000000 --- a/deeppavlov/skills/dsl_skill/context.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2019 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -from typing import Optional, Union, Dict - -from deeppavlov.skills.dsl_skill.utils import UserId - - -class UserContext: - """ - A UserContext object stores the information that the skill currently knows about the user. - - Args: - user_id: id of the user - message: current message - current_state: current user state - payload: custom payload dictionary, or a JSON-serialized string of such a dictionary - - Attributes: - handler_payload: stores information generated by the selected handler - - """ - - def __init__( - self, - user_id: Optional[UserId] = None, - message: Optional[str] = None, - current_state: Optional[str] = None, - payload: Optional[Union[Dict, str]] = None, - ): - self.user_id = user_id - self.message = message - self.current_state = current_state - self.handler_payload = {} - - # some custom data added by the skill creator - self.payload = payload - if payload == '' or payload is None: - self.payload = {} - elif isinstance(payload, str): - self.payload = json.loads(payload) diff --git a/deeppavlov/skills/dsl_skill/dsl_skill.py b/deeppavlov/skills/dsl_skill/dsl_skill.py deleted file mode 100644 index 93e9f8544d..0000000000 --- a/deeppavlov/skills/dsl_skill/dsl_skill.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright 2019 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import ABCMeta -from collections import defaultdict -from functools import partial -from itertools import zip_longest, starmap -from typing import List, Optional, Dict, Callable, Tuple - -from deeppavlov.core.common.registry import register -from deeppavlov.skills.dsl_skill.context import UserContext -from deeppavlov.skills.dsl_skill.handlers.handler import Handler -from deeppavlov.skills.dsl_skill.handlers.regex_handler import RegexHandler -from deeppavlov.skills.dsl_skill.utils import SkillResponse, UserId - - -class DSLMeta(ABCMeta): - """ - This metaclass is used for creating a skill. A skill is registered by its class name in the registry. - - Example: - - .. code:: python - - class ExampleSkill(metaclass=DSLMeta): - @DSLMeta.handler(commands=["hello", "hey"]) - def __greeting(context: UserContext): - response = "Hello, my friend!"
- confidence = 1.0 - return response, confidence - - Attributes: - name: class name - state_to_handler: dict with states as keys and lists of Handler objects as values - user_to_context: dict with user ids as keys and UserContext objects as values - universal_handlers: list of handlers that can be activated from any state - - """ - skill_collection: Dict[str, 'DSLMeta'] = {} - - def __init__(cls, name: str, - bases, - namespace, - **kwargs): - super().__init__(name, bases, namespace, **kwargs) - cls.name = name - cls.state_to_handler = defaultdict(list) - cls.user_to_context = defaultdict(UserContext) - cls.universal_handlers = [] - - handlers = [attribute for attribute in namespace.values() if isinstance(attribute, Handler)] - - for handler in handlers: - if handler.state is None: - cls.universal_handlers.append(handler) - else: - cls.state_to_handler[handler.state].append(handler) - - cls.handle = partial(DSLMeta.__handle, cls) - cls.__call__ = partial(DSLMeta.__handle_batch, cls) - cls.__init__ = partial(DSLMeta.__init__class, cls) - register()(cls) - DSLMeta.__add_to_collection(cls) - - def __init__class(cls, - on_invalid_command: str = "Простите, я вас не понял", - null_confidence: float = 0, - *args, **kwargs) -> None: - """ - Initialize Skill class - - Args: - on_invalid_command: message to be sent when an utterance has no associated handler - null_confidence: the confidence when the DSL has no handler that fits the request - """ - # message to be sent when an utterance has no associated handler - cls.on_invalid_command = on_invalid_command - cls.null_confidence = null_confidence - - def __handle_batch(cls: 'DSLMeta', - utterances_batch: List[str], - user_ids_batch: List[UserId]) -> Tuple[List, ...]: - """Returns skill inference result. - Returns batches of skill inference results, estimated confidence - levels and up to date states corresponding to incoming utterance - batch. - - Args: - utterances_batch: A batch of utterances of str type. - user_ids_batch: A batch of user ids. - - Returns: - response_batch: A batch of arbitrary typed skill inference results. - confidence_batch: A batch of float typed confidence levels for each - skill inference result. - - """ - return (*map(list, zip(*starmap(cls.handle, zip_longest(utterances_batch, user_ids_batch)))),) - - @staticmethod - def __add_to_collection(cls: 'DSLMeta') -> None: - """ - Adds a Skill class to the Skill classes collection - - Args: - cls: Skill class - - """ - DSLMeta.skill_collection[cls.name] = cls - - @staticmethod - def __handle(cls: 'DSLMeta', - utterance: str, - user_id: UserId) -> SkillResponse: - """ - Handles a message once it has arrived from the user. - Simple usage: - skill([], []) - - Args: - cls: instance of callee's class - utterance: a message to be handled - user_id: id of a user - - Returns: - result: handler function's result if succeeded - - """ - context = cls.user_to_context[user_id] - - context.user_id = user_id - context.message = utterance - - current_handler = cls.__select_handler(context) - return cls.__run_handler(current_handler, context) - - def __select_handler(cls, - context: UserContext) -> Optional[Callable]: - """ - Selects the handler with the highest priority that could be triggered from the passed context.
- - Returns: - the selected handler function, or None if no handler fits the request - - """ - # concatenate instead of extending to avoid mutating the handler lists stored in state_to_handler - available_handlers = cls.state_to_handler[context.current_state] + cls.universal_handlers - available_handlers.sort(key=lambda h: h.priority, reverse=True) - for handler in available_handlers: - if handler.check(context): - handler.expand_context(context) - return handler.func - - def __run_handler(cls, handler: Optional[Callable], - context: UserContext) -> SkillResponse: - """ - Runs the specified handler for the current context - - Args: - handler: handler to be run. If None, on_invalid_command is returned - context: user context - - Returns: - SkillResponse - - """ - if handler is None: - return SkillResponse(cls.on_invalid_command, cls.null_confidence) - try: - return SkillResponse(*handler(context=context)) - except Exception as exc: - return SkillResponse(str(exc), 1.0) - - @staticmethod - def handler(commands: Optional[List[str]] = None, - state: Optional[str] = None, - context_condition: Optional[Callable] = None, - priority: int = 0) -> Callable: - """ - Decorator to be used in skills' classes. - Sample usage: - - .. code:: python - - class ExampleSkill(metaclass=DSLMeta): - @DSLMeta.handler(commands=["hello", "hi", "sup", "greetings"]) - def __greeting(context: UserContext): - response = "Hello, my friend!" - confidence = 1.0 - return response, confidence - - Args: - priority: integer value to indicate priority. If multiple handlers satisfy - all the requirements, the handler with the greatest priority value will be used - context_condition: function that takes context and - returns True if this handler should be enabled - and False otherwise. If None, no condition is checked - commands: phrases/regexes on which the function wrapped - by this decorator will trigger - state: state name - - Returns: - function decorated into Handler class - - """ - if commands is None: - commands = [".*"] - - def decorator(func: Callable) -> Handler: - return RegexHandler(func, commands, - context_condition=context_condition, - priority=priority, state=state) - - return decorator diff --git a/deeppavlov/skills/dsl_skill/handlers/__init__.py b/deeppavlov/skills/dsl_skill/handlers/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/deeppavlov/skills/dsl_skill/handlers/handler.py b/deeppavlov/skills/dsl_skill/handlers/handler.py deleted file mode 100644 index c041404e82..0000000000 --- a/deeppavlov/skills/dsl_skill/handlers/handler.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2019 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Callable, Optional - -from deeppavlov.skills.dsl_skill.context import UserContext -from deeppavlov.skills.dsl_skill.utils import SkillResponse - - -class Handler: - """ - A Handler instance helps the DSLMeta class distinguish functions wrapped - by @DSLMeta.handler so that it can add them to the handlers storage.
- It also checks if the handler function should be triggered based on the given context. - - Attributes: - func: handler function - state: state in which the handler can be activated - priority: priority of the function. If 2 or more handlers can be activated, the handler - with the highest priority is selected - context_condition: predicate that accepts user context and checks if the handler should be activated. Example: - `lambda context: context.user_id != 1` checks if user_id is not equal to 1. - That means a user with id 1 will always be ignored by the handler. - - """ - - def __init__(self, - func: Callable, - state: Optional[str] = None, - context_condition: Optional[Callable] = None, - priority: int = 0): - self.func = func - self.state = state - self.context_condition = context_condition - self.priority = priority - - def __call__(self, context: UserContext) -> SkillResponse: - return self.func(context) - - def check(self, context: UserContext) -> bool: - """ - Checks: - - if the handler function should be triggered based on the given context via context condition. - - Args: - context: user context - - Returns: - True if the handler should be activated, False otherwise - """ - if self.context_condition is not None: - return self.context_condition(context) - return True - - def expand_context(self, context: UserContext) -> UserContext: - context.handler_payload = {} - return context diff --git a/deeppavlov/skills/dsl_skill/handlers/regex_handler.py b/deeppavlov/skills/dsl_skill/handlers/regex_handler.py deleted file mode 100644 index 04cf171774..0000000000 --- a/deeppavlov/skills/dsl_skill/handlers/regex_handler.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2019 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re -from typing import List, Callable, Optional - -from deeppavlov.skills.dsl_skill.context import UserContext -from deeppavlov.skills.dsl_skill.handlers.handler import Handler - - -class RegexHandler(Handler): - """ - This handler checks whether the message that is passed to it is matched by a regex. - - Adds the following key to ```context.handler_payload```: - - 'regex_groups' - groups parsed from the regular expression in the command, by name - - Attributes: - func: handler function - state: state in which the handler can be activated - priority: priority of the function. If 2 or more handlers can be activated, the function - with the highest priority is selected - context_condition: predicate that accepts user context and checks if the handler should be activated. - Example: `lambda context: context.user_id != 1` checks if user_id is not equal to 1. - That means a user with id 1 will always be ignored by the handler.
- commands: the handler is activated if a regular expression from this list matches the user message - - """ - - def __init__(self, - func: Callable, - commands: Optional[List[str]] = None, - state: Optional[str] = None, - context_condition: Optional[Callable] = None, - priority: int = 0): - super().__init__(func, state, context_condition, priority) - self.commands = [re.compile(command) for command in commands] - - def check(self, context: UserContext) -> bool: - """ - Checks: - - if the handler function should be triggered based on the given context via context condition. - - if at least one of the commands matches `context.message`. - - Args: - context: user context - - Returns: - True if the handler should be activated, False otherwise - """ - previous_check_passed = super().check(context) - if not previous_check_passed: - return False - - message = context.message - return any(re.search(regexp, ' '.join(message)) for regexp in self.commands) - - def expand_context(self, context: UserContext) -> UserContext: - context.handler_payload = {'regex_groups': {}} - # join the tokenized message once so that match spans index into the same string - joined_message = ' '.join(context.message) - for regexp in self.commands: - match = re.search(regexp, joined_message) - if match is not None: - for group_ind, span in enumerate(match.regs): - context.handler_payload['regex_groups'][group_ind] = joined_message[span[0]: span[1]] - for group_name, group_ind in regexp.groupindex.items(): - context.handler_payload['regex_groups'][group_name] = \ - context.handler_payload['regex_groups'][group_ind] - return context diff --git a/deeppavlov/skills/dsl_skill/utils.py b/deeppavlov/skills/dsl_skill/utils.py deleted file mode 100644 index 717d52f637..0000000000 --- a/deeppavlov/skills/dsl_skill/utils.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2019 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Union, NamedTuple - -UserId = Union[str, int] - - -class SkillResponse(NamedTuple): - response: str - confidence: float diff --git a/deeppavlov/skills/rasa_skill/__init__.py b/deeppavlov/skills/rasa_skill/__init__.py deleted file mode 100644 index d694bafa04..0000000000 --- a/deeppavlov/skills/rasa_skill/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .rasa_skill import RASASkill diff --git a/deeppavlov/skills/rasa_skill/rasa_skill.py b/deeppavlov/skills/rasa_skill/rasa_skill.py deleted file mode 100644 index e334f4b0ab..0000000000 --- a/deeppavlov/skills/rasa_skill/rasa_skill.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright 2019 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. - -import asyncio -import logging -import uuid -from functools import reduce -from pathlib import Path -from typing import Tuple, Optional, List - -from rasa.cli.utils import get_validated_path -from rasa.constants import DEFAULT_MODELS_PATH -from rasa.core.agent import Agent -from rasa.core.channels import CollectingOutputChannel -from rasa.core.channels import UserMessage -from rasa.model import get_model - -from deeppavlov.core.common.registry import register -from deeppavlov.core.models.component import Component - -logger = logging.getLogger(__name__) - - -@register("rasa_skill")
class RASASkill(Component): - """RASASkill lets you wrap a RASA Agent as a Skill within the DeepPavlov environment. - - The component requires a path to your RASA models (a folder with timestamped tar.gz archives), - the same one you use in the command `rasa run -m models --enable-api --log-file out.log` - - """ - - def __init__(self, path_to_models: str, **kwargs) -> None: - """ - Constructs a RASA Agent as a DeepPavlov skill: - read the model folder, - initialize rasa.core.agent.Agent and wrap its interfaces - - Args: - path_to_models: string path to folder with RASA models - - """ - # we need an absolute path (expanded for user home and resolved if it is a relative path): - self.path_to_models = Path(path_to_models).expanduser().resolve() - - model = get_validated_path(self.path_to_models, "model", DEFAULT_MODELS_PATH) - - model_path = get_model(model) - if not model_path: - # cannot load model path - raise Exception("cannot load model path: %s" % model) - - self._agent = Agent.load(model_path) - self.ioloop = asyncio.new_event_loop() - logger.info(f"path to RASA models is: `{self.path_to_models}`") - - def __call__(self, - utterances_batch: List[str], - states_batch: Optional[List] = None) -> Tuple[List[str], List[float], list]: - """Returns skill inference result. - - Returns batches of skill inference results, estimated confidence - levels and up to date states corresponding to incoming utterance - batch. - - Args: - utterances_batch: A batch of utterances of str type. - states_batch: A batch of arbitrary typed states for - each utterance. - - - Returns: - response: A batch of arbitrary typed skill inference results. - confidence: A batch of float typed confidence levels for each - skill inference result. - output_states_batch: A batch of arbitrary typed states for - each utterance.
- - """ - user_ids, output_states_batch = self._handle_user_identification(utterances_batch, states_batch) - ################################################################################# - # RASA use asyncio for handling messages and handle_text is async function, - # so we need to instantiate event loop - # futures = [rasa_confident_response_decorator(self._agent, utt, sender_id=uid) for utt, uid in - futures = [self.rasa_confident_response_decorator(self._agent, utt, sender_id=uid) for utt, uid in - zip(utterances_batch, user_ids)] - - asyncio.set_event_loop(self.ioloop) - results = self.ioloop.run_until_complete(asyncio.gather(*futures)) - - responses_batch, confidences_batch = zip(*results) - return responses_batch, confidences_batch, output_states_batch - - async def rasa_confident_response_decorator(self, rasa_agent, text_message, sender_id): - """ - Args: - rasa_agent: rasa.core.agent.Agent instance - text_message: str with utterance from user - sender_id: id of the user - - Returns: None or tuple with str and float, where first element is a message and second is - confidence - """ - - resp = await self.rasa_handle_text_verbosely(rasa_agent, text_message, sender_id) - if resp: - responses, confidences, actions = resp - else: - logger.warning("Null response from RASA Skill") - return None - - # for adaptation to deep pavlov arch we need to merge multi-messages into single string: - texts = [each_resp['text'] for each_resp in responses if 'text' in each_resp] - merged_message = "\n".join(texts) - - merged_confidence = reduce(lambda a, b: a * b, confidences) - # TODO possibly it better to choose another function for calculation of final confidence - # current realisation of confidence propagation may cause confidence decay for long actions - # chains. If long chains is your case, try max(confidence) or confidence[0] - return merged_message, merged_confidence - - async def rasa_handle_text_verbosely(self, rasa_agent, text_message, sender_id): - """ - This function reimplements RASA's rasa.core.agent.Agent.handle_text method to allow to retrieve - message responses with confidence estimation altogether. - - It reconstructs with merge RASA's methods: - https://github.com/RasaHQ/rasa_core/blob/master/rasa/core/agent.py#L401 - https://github.com/RasaHQ/rasa_core/blob/master/rasa/core/agent.py#L308 - https://github.com/RasaHQ/rasa/blob/master/rasa/core/processor.py#L327 - - This required to allow RASA to output confidences with actions altogether - (Out of the box RASA does not support such use case). - - Args: - rasa_agent: rasa.core.agent.Agent instance - text_message: str with utterance from user - sender_id: id of the user - - Returns: None or - tuple where first element is a list of messages dicts, the second element is a list - of confidence scores for all actions (it is longer than messages list, because some actions - does not produce messages) - - """ - message = UserMessage(text_message, - output_channel=None, - sender_id=sender_id) - - processor = rasa_agent.create_processor() - tracker = processor._get_tracker(message.sender_id) - - confidences = [] - actions = [] - await processor._handle_message_with_tracker(message, tracker) - # save tracker state to continue conversation from this state - processor._save_tracker(tracker) - - # here we restore some of logic in RASA management. 
- # ###### Loop of IntraStep decisions ########################################################## - # await processor._predict_and_execute_next_action(msg, tracker): - # https://github.com/RasaHQ/rasa/blob/master/rasa/core/processor.py#L327-L362 - # keep taking actions decided by the policy until it chooses to 'listen' - should_predict_another_action = True - num_predicted_actions = 0 - - def is_action_limit_reached(): - return (num_predicted_actions == processor.max_number_of_predictions and - should_predict_another_action) - - # action loop. predicts actions until we hit action listen - while (should_predict_another_action and - processor._should_handle_message(tracker) and - num_predicted_actions < processor.max_number_of_predictions): - # this actually just calls the policy's method by the same name - action, policy, confidence = processor.predict_next_action(tracker) - - confidences.append(confidence) - actions.append(action) - - should_predict_another_action = await processor._run_action( - action, - tracker, - message.output_channel, - processor.nlg, - policy, confidence - ) - num_predicted_actions += 1 - - if is_action_limit_reached(): - # circuit breaker was tripped - logger.warning( - "Circuit breaker tripped. Stopped predicting " - "more actions for sender '{}'".format(tracker.sender_id)) - if processor.on_circuit_break: - # call a registered callback - processor.on_circuit_break(tracker, message.output_channel, processor.nlg) - - if isinstance(message.output_channel, CollectingOutputChannel): - - return message.output_channel.messages, confidences, actions - else: - return None - - def _generate_user_id(self) -> str: - """ - Put user id generation logic here if you want to implement it in the skill. - - Although it is better to delegate user_id generation to the Agent layer. - Returns: str - - """ - return uuid.uuid1().hex - - def _handle_user_identification(self, utterances_batch, states_batch): - """This method preprocesses the states batch to guarantee that all users are identified (or - identifiers are generated for all users). - - Args: - utterances_batch: batch of utterances - states_batch: batch of states - - Returns: - - """ - # grasp user_ids from the states batch. - # We expect that the skill receives None or a dict of state for each utterance. - # if the state has a user_id then the skill uses it, otherwise it generates a user_id and - # refers to the user by this name from then on. - - # In this implementation we use time-based uuid1 for generating unique ids - output_states_batch = [] - user_ids = [] - if states_batch is None: - # generate a states batch matching the batch of utterances: - states_batch = [None] * len(utterances_batch) - - for state in states_batch: - if not state: - user_id = self._generate_user_id() - new_state = {'user_id': user_id} - - elif 'user_id' not in state: - new_state = state - user_id = self._generate_user_id() - # store the same freshly generated id in the state - new_state['user_id'] = user_id - - else: - new_state = state - user_id = new_state['user_id'] - - user_ids.append(user_id) - output_states_batch.append(new_state) - return user_ids, output_states_batch - - def destroy(self): - self.ioloop.close() - super().destroy() diff --git a/docs/apiref/skills.rst b/docs/apiref/skills.rst deleted file mode 100644 index bfb47a5e59..0000000000 --- a/docs/apiref/skills.rst +++ /dev/null @@ -1,12 +0,0 @@ -skills -====== -Skill classes. Skills are dialog models. - -.. automodule:: deeppavlov.skills - :members: - -..
toctree:: - :glob: - :caption: Skills - - skills/* \ No newline at end of file diff --git a/docs/apiref/skills/aiml_skill.rst b/docs/apiref/skills/aiml_skill.rst deleted file mode 100644 index 97e7e0ffce..0000000000 --- a/docs/apiref/skills/aiml_skill.rst +++ /dev/null @@ -1,5 +0,0 @@ -deeppavlov.skills.aiml_skill -============================ - -.. automodule:: deeppavlov.skills.aiml_skill.aiml_skill - :members: diff --git a/docs/apiref/skills/dsl_skill.rst b/docs/apiref/skills/dsl_skill.rst deleted file mode 100644 index 7bc6cf2fed..0000000000 --- a/docs/apiref/skills/dsl_skill.rst +++ /dev/null @@ -1,5 +0,0 @@ -deeppavlov.skills.dsl_skill -============================================ - -.. automodule:: deeppavlov.skills.dsl_skill.dsl_skill - :members: diff --git a/docs/apiref/skills/rasa_skill.rst b/docs/apiref/skills/rasa_skill.rst deleted file mode 100644 index 4dcd93f7ac..0000000000 --- a/docs/apiref/skills/rasa_skill.rst +++ /dev/null @@ -1,5 +0,0 @@ -deeppavlov.skills.rasa_skill -============================ - -.. automodule:: deeppavlov.skills.rasa_skill.rasa_skill - :members: diff --git a/docs/conf.py b/docs/conf.py index e9d5e42c9c..b3a4f11237 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -190,7 +190,7 @@ # -- Extension configuration ------------------------------------------------- -autodoc_mock_imports = ['aiml', 'bert_dp', 'bs4', 'faiss', 'fastText', 'fasttext', 'gensim', 'hdt', 'kenlm', 'librosa', +autodoc_mock_imports = ['bert_dp', 'bs4', 'faiss', 'fastText', 'fasttext', 'gensim', 'hdt', 'kenlm', 'librosa', 'lxml', 'nemo', 'nemo_asr', 'nemo_tts', 'nltk', 'opt_einsum', 'rapidfuzz', 'rasa', 'russian_tagsets', 'sacremoses', 'sortedcontainers', 'spacy', 'tensorflow', 'tensorflow_hub', 'torch', 'transformers', 'udapi', 'ufal_udpipe', 'whapi', 'xeger'] diff --git a/docs/features/skills/aiml_skill.rst b/docs/features/skills/aiml_skill.rst deleted file mode 100644 index ac385c7ea9..0000000000 --- a/docs/features/skills/aiml_skill.rst +++ /dev/null @@ -1,44 +0,0 @@ -AIML Skill -====================== - -An :doc:`AIML scripts wrapper implementation` that reads a folder with AIML scripts -(provided by the `path_to_aiml_scripts` argument), loads them into the AIML Kernel and responds to incoming utterances -according to the patterns learned by the AIML Kernel. - -When the AIML kernel matches an utterance and finds a response, the AIML wrapper outputs the response with a confidence -value (as specified by the `positive_confidence` argument). - -When no match occurred, the wrapper returns the argument `null_response` as the utterance and sets the confidence to -the `null_confidence` value. - - -Quick Start ------------ -To set up the AIML Skill you need to load your AIML scripts into some folder and specify the path to it with the initialization -parameter `path_to_aiml_scripts`. - -You can download a bunch of free, ready-to-use AIML scripts from the pandorabots repo: -https://github.com/pandorabots/Free-AIML - -The DeepPavlov library has a default config for AIMLSkill here: :config:`configs/skills/aiml_skill.json ` - -Usage -^^^^^^^^ - -..
code:: python - - from deeppavlov.skills.aiml_skill import AIMLSkill - - aiml_skill_config = { - 'positive_confidence': 0.66, - 'path_to_aiml_scripts': "<path to your AIML scripts>", - 'null_response': "I don't know what to answer you", - 'null_confidence': 0.33 - } - - aiml_skill = AIMLSkill(**aiml_skill_config) - - states_batch = None - for utterance in ["Hello", "Hello to the same user_id"]: - responses_batch, confidences_batch, states_batch = aiml_skill([utterance], states_batch) - print(responses_batch[0]) diff --git a/docs/features/skills/dsl_skill.rst b/docs/features/skills/dsl_skill.rst deleted file mode 100644 index 919b1661a5..0000000000 --- a/docs/features/skills/dsl_skill.rst +++ /dev/null @@ -1,42 +0,0 @@ -DSL Skill -====================== - -A :doc:`DSL implementation`. The DSL helps to easily create user-defined skills for dialog systems. - -When the DSL skill matches an utterance and finds a response, it outputs the response with a confidence value. - -When no match occurred, the DSL skill returns the argument `on_invalid_command` ("Простите, я вас не понял", Russian for "Sorry, I did not understand you", by default) as the utterance and sets the confidence to the `null_confidence` value (0 by default). - -`on_invalid_command` and `null_confidence` can be changed in the model config - - -Quick Start ------------ - -The DeepPavlov library has a default config for DSLSkill here: :config:`configs/skills/dsl_skill.json ` - -Usage -^^^^^^^^ - -.. code:: python - - from deeppavlov import configs, build_model - from deeppavlov.core.common.file import read_json - from deeppavlov.skills.dsl_skill import DSLMeta - - - class DSLSkill(metaclass=DSLMeta): - @DSLMeta.handler(commands=["hello", "hi", "sup", "greetings"]) - def greeting(context): - response = "Hello, my friend!" - confidence = 1.0 - return response, confidence - - - skill_config = read_json(configs.skills.dsl_skill) - - skill = build_model(skill_config, download=True) - utterance = "Hello" - user_id = 1 - response = skill([utterance], [user_id]) - print(response) diff --git a/docs/features/skills/rasa_skill.rst b/docs/features/skills/rasa_skill.rst deleted file mode 100644 index 5f8ffdd3db..0000000000 --- a/docs/features/skills/rasa_skill.rst +++ /dev/null @@ -1,50 +0,0 @@ -Rasa Skill -====================== - -A :class:`Rasa wrapper implementation` that reads a folder with Rasa models -(provided by the ``path_to_models`` argument), initializes a Rasa Agent with this configuration and responds to incoming -utterances according to the responses predicted by Rasa. Each response has a confidence value estimated as the product of the -scores of the actions executed by the Rasa system in the current prediction step (each prediction step in Rasa usually consists of -multiple actions). If Rasa responds with multiple ``BotUttered`` actions, then such phrases are merged into one utterance -divided by ``'\n'``. - -Quick Start ------------ -To set up a Rasa Skill you need to have a working Rasa project at some path; then you can specify the path to Rasa's -models (usually it is a folder named ``models`` inside the project path) at initialization of the Rasa Skill class -by providing the ``path_to_models`` attribute. - -Dummy Rasa project ------------------- -The DeepPavlov library has :config:`a template config for RASASkill`. -This project is in essence a working Rasa project created with the ``rasa init`` and ``rasa train`` commands -with minimal additions. The Rasa bot can greet, answer questions about what it can do, and detect the user's mood sentiment. - -The template DeepPavlov config specifies only one component (RASASkill) in :doc:`a pipeline`.
-The ``metadata.download`` field in the configuration allows downloading and unpacking the gzipped template project into -the subdir ``{DOWNLOADS_PATH}``. - -If you create a configuration for a Rasa project hosted on your machine, you don't need to specify ``metadata.download`` -and just need to correctly set ``path_to_models`` of the ``rasa_skill`` component. -``path_to_models`` needs to be a path to your Rasa's ``models`` directory. - -See `Rasa's documentation `_ for an explanation of how -to create a project. - -Usage without DeepPavlov configuration files -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code:: python - - from deeppavlov.skills.rasa_skill import RASASkill - - rasa_skill_config = { - 'path_to_models': "<path to your Rasa models>", - } - - rasa_skill = RASASkill(**rasa_skill_config) - - states_batch = None - for utterance in ["Hello", "Hello to the same user_id"]: - responses_batch, confidences_batch, states_batch = rasa_skill([utterance], states_batch) - print(responses_batch[0]) diff --git a/docs/index.rst b/docs/index.rst index 1f5f795c33..723d7c8539 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -54,9 +54,6 @@ Welcome to DeepPavlov's documentation! Goal-Oriented Dialogue Bot Open-Domain Question Answering Frequently Asked Questions Answering - AIML - Rasa - DSL .. toctree:: diff --git a/examples/gobot_md_yaml_configs_tutorial.ipynb b/examples/gobot_md_yaml_configs_tutorial.ipynb index 69d9431d51..01426e40a2 100644 --- a/examples/gobot_md_yaml_configs_tutorial.ipynb +++ b/examples/gobot_md_yaml_configs_tutorial.ipynb @@ -54,7 +54,6 @@ "At DeepPavlov, we support a variety of industry-wide and popular standards to support developing Conversational AI solutions.\n", "DSLs, known as Domain-Specific Languages, provide a rich mechanism to define the behavior, or \"the what\", while\n", "the underlying system uses the parser to transform these definitions into commands that implement this behavior, or \"the how\" using the system's components.\n", - "Until very recently we supported two such DSLs, including industry-standard [AIML](http://docs.deeppavlov.ai/en/master/features/skills/aiml_skill.html), as well as [DSL](http://docs.deeppavlov.ai/en/master/features/skills/dsl_skill.html) designed by one of our partners, EORA.\n", "\n", "In this tutorial, you will learn how to use another industrial DSL, or, better said, set of DSLs, introduced by RASA.ai,\n", "to build simple goal-oriented chatbots using DeepPavlov's GO-bot.\n", diff --git a/tests/test_aiml_skill.py b/tests/test_aiml_skill.py deleted file mode 100644 index ffa08a3634..0000000000 --- a/tests/test_aiml_skill.py +++ /dev/null @@ -1,37 +0,0 @@ -from logging import getLogger - -from deeppavlov import configs, build_model -from deeppavlov.utils.pip_wrapper.pip_wrapper import install_from_config - -log = getLogger(__name__) - - -class TestAIMLSkill: - def setup(self): - config_ref = configs.skills.aiml_skill - install_from_config(config_ref) - self.aiml_skill = build_model(config_ref, download=True) - - def test_simple_reaction(self): - user_messages_sequence = [ - "Hello", - "What s up?", - "Tell me a joke", - "Learn my pants are Red", - "LET DISCUSS MOVIES", - "Comedy movies are nice to watch", - "I LIKE WATCHING COMEDY!", - "Ok, goodbye" - ] - - history_of_responses = [] - for each_utt in user_messages_sequence: - log.info(f"User says: {each_utt}") - responses_batch, _, _ = self.aiml_skill([each_utt], [None]) - log.info(f" Bot says: {responses_batch[0]}") - history_of_responses.append(responses_batch) - - # check the first greeting message in 0th
batch - assert "Well, hello!" in history_of_responses[0][0] - # check fifth message in 0th batch - assert "Yes movies" in history_of_responses[4][0] diff --git a/tests/test_dsl_skill.py b/tests/test_dsl_skill.py deleted file mode 100644 index 6a332b44ce..0000000000 --- a/tests/test_dsl_skill.py +++ /dev/null @@ -1,109 +0,0 @@ -from logging import getLogger - -from deeppavlov import configs, build_model -from deeppavlov.core.common.file import read_json -from deeppavlov.skills.dsl_skill import DSLMeta -from deeppavlov.utils.pip_wrapper.pip_wrapper import install_from_config - -log = getLogger(__name__) - - -class DSLSkill(metaclass=DSLMeta): - @DSLMeta.handler(commands=["hello", "hi", "sup", "greetings"]) - def greeting(context): - response = "Hello, my friend!" - confidence = 1.0 - return response, confidence - - -class StateSkill(metaclass=DSLMeta): - @DSLMeta.handler(commands=["hello", "hi", "sup", "greetings"]) - def greeting(context): - response = "Hello, my friend!" - confidence = 1.0 - context.current_state = "state1" - return response, confidence - - @DSLMeta.handler(commands=["bye"], - state="state1") - def bye(context): - response = "bb!" - confidence = 1.0 - return response, confidence - - -class ContextConditionSkill(metaclass=DSLMeta): - @DSLMeta.handler(commands=["hello", "hi", "sup", "greetings"], - context_condition=lambda context: context.user_id != 1) - def greeting(context): - response = "Hello, my friend!" - confidence = 1.0 - return response, confidence - - -class TestDSLSkill: - def setup(self): - self.skill_config = read_json(configs.skills.dsl_skill) - install_from_config(self.skill_config) - - def test_simple_skill(self): - user_messages_sequence = [ - "Hello", - "Hi", - "Tell me a joke", - "Sup", - "Ok, goodbye" - ] - - skill = build_model(self.skill_config, download=True) - history_of_responses = [] - for user_id, each_utt in enumerate(user_messages_sequence): - log.info(f"User says: {each_utt}") - responses_batch = skill([each_utt], [user_id]) - log.info(f"Bot says: {responses_batch[0]}") - history_of_responses.append(responses_batch) - - # check the first greeting message in 0th batch - assert "Hello, my friend!" in history_of_responses[0][0] - # check the second greeting message in 0th batch - assert "Hello, my friend!" in history_of_responses[1][0] - # check `on_invalid_command` - assert "Sorry, I do not understand you" in history_of_responses[2][0] - - def test_switch_state(self): - user_messages_sequence = [ - "Hello", - "bye", - "bye" - ] - - self.skill_config["chainer"]["pipe"][1]["class_name"] = "StateSkill" - skill = build_model(self.skill_config, download=True) - - history_of_responses = [] - for user_id, each_utt in enumerate(user_messages_sequence): - log.info(f"User says: {each_utt}") - responses_batch = skill([each_utt], [user_id % 2]) - log.info(f"Bot says: {responses_batch[0]}") - history_of_responses.append(responses_batch) - assert "Hello, my friend!" in history_of_responses[0][0] - assert "Sorry, I do not understand you" in history_of_responses[1][0] - assert "bb!" 
in history_of_responses[2][0] - - def test_context_condition(self): - user_messages_sequence = [ - "Hello", - "Hi" - ] - - self.skill_config["chainer"]["pipe"][1]["class_name"] = "ContextConditionSkill" - skill = build_model(self.skill_config, download=True) - - history_of_responses = [] - for user_id, each_utt in enumerate(user_messages_sequence): - log.info(f"User says: {each_utt}") - responses_batch = skill([each_utt], [user_id]) - log.info(f"Bot says: {responses_batch[0]}") - history_of_responses.append(responses_batch) - assert "Hello, my friend!" in history_of_responses[0][0] - assert "Sorry, I do not understand you" in history_of_responses[1][0] diff --git a/tests/test_rasa_skill.py b/tests/test_rasa_skill.py deleted file mode 100644 index 7235ade7f3..0000000000 --- a/tests/test_rasa_skill.py +++ /dev/null @@ -1,39 +0,0 @@ -from logging import getLogger - -from deeppavlov import configs, build_model -from deeppavlov.utils.pip_wrapper.pip_wrapper import install_from_config - -log = getLogger(__name__) - - -class TestRASASkill: - def setup(self): - config_ref = configs.skills.rasa_skill - install_from_config(config_ref) - self.rasa_skill = build_model(config_ref, download=True) - - def test_simple_reaction(self): - user_messages_sequence = [ - "Hello", - "What can you do?", - "Tell me a joke", - "Learn my pants are Red", - "LET DISCUSS MOVIES", - "Comedy movies are nice to watch", - "I LIKE WATCHING COMEDY!", - "Ok, goodbye" - ] - - history_of_responses = [] - for each_utt in user_messages_sequence: - log.info(f"User says: {each_utt}") - responses_batch, _ = self.rasa_skill([each_utt]) - log.info(f" Bot says: {responses_batch[0]}") - history_of_responses.append(responses_batch) - - print("history_of_responses:") - print(history_of_responses) - # # check the first greeting message in 0th batch - # assert "Hey! How are you?" in history_of_responses[0][0] - # # check second response message in 0th batch - # assert "I can chat with you. You can greet me" in history_of_responses[1][0] From af607447774bdb9795ac2f9fe9a913427559edc9 Mon Sep 17 00:00:00 2001 From: Vasily Date: Thu, 16 Dec 2021 22:12:04 +0300 Subject: [PATCH 3/3] Feat/glue superglue update (#1508) * Add wnli config * Update copa config * Fix path * Fix record path * Exclude train from evaluation * Exclude train from evaluation * add ranker * update ranker * feat: deeppavlov version update Co-authored-by: Fedor Ignatov Co-authored-by: slowwavesleep <44175589+slowwavesleep@users.noreply.github.com> --- deeppavlov/_meta.py | 2 +- .../classifiers/glue/glue_mnli_roberta.json | 1 - .../glue/glue_rte_roberta_mnli.json | 1 - .../classifiers/glue/glue_wnli_roberta.json | 147 +++++++++++ .../superglue/superglue_copa_roberta.json | 236 +++++++++++------- .../superglue/superglue_record_roberta.json | 2 +- .../regressors/translation_ranker.json | 105 ++++++++ 7 files changed, 397 insertions(+), 97 deletions(-) create mode 100644 deeppavlov/configs/classifiers/glue/glue_wnli_roberta.json create mode 100644 deeppavlov/configs/regressors/translation_ranker.json diff --git a/deeppavlov/_meta.py b/deeppavlov/_meta.py index d3486ec284..2ee55f7d22 100644 --- a/deeppavlov/_meta.py +++ b/deeppavlov/_meta.py @@ -1,4 +1,4 @@ -__version__ = '0.17.1' +__version__ = '0.17.2' __author__ = 'Neural Networks and Deep Learning lab, MIPT' __description__ = 'An open source library for building end-to-end dialog systems and training chatbots.' 
__keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot'] diff --git a/deeppavlov/configs/classifiers/glue/glue_mnli_roberta.json b/deeppavlov/configs/classifiers/glue/glue_mnli_roberta.json index 7ff348e303..16b20476c0 100644 --- a/deeppavlov/configs/classifiers/glue/glue_mnli_roberta.json +++ b/deeppavlov/configs/classifiers/glue/glue_mnli_roberta.json @@ -121,7 +121,6 @@ "log_every_n_batches": 250, "show_examples": false, "evaluation_targets": [ - "train", "valid" ], "class_name": "torch_trainer", diff --git a/deeppavlov/configs/classifiers/glue/glue_rte_roberta_mnli.json b/deeppavlov/configs/classifiers/glue/glue_rte_roberta_mnli.json index feb3f17ae5..6001c5cce7 100644 --- a/deeppavlov/configs/classifiers/glue/glue_rte_roberta_mnli.json +++ b/deeppavlov/configs/classifiers/glue/glue_rte_roberta_mnli.json @@ -121,7 +121,6 @@ "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": [ - "train", "valid" ], "class_name": "torch_trainer", diff --git a/deeppavlov/configs/classifiers/glue/glue_wnli_roberta.json b/deeppavlov/configs/classifiers/glue/glue_wnli_roberta.json new file mode 100644 index 0000000000..34b300c4b8 --- /dev/null +++ b/deeppavlov/configs/classifiers/glue/glue_wnli_roberta.json @@ -0,0 +1,147 @@ +{ + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "BASE_MODEL": "roberta-large", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/classifiers/glue_wnli/{BASE_MODEL}" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/0.16/classifiers/glue_wnli_roberta.tar.gz", + "subdir": "{MODELS_PATH}" + } + ] + }, + "dataset_reader": { + "class_name": "huggingface_dataset_reader", + "path": "glue", + "name": "wnli", + "train": "train", + "valid": "validation" + }, + "dataset_iterator": { + "class_name": "huggingface_dataset_iterator", + "features": [ + "sentence1", + "sentence2" + ], + "label": "label", + "seed": 42 + }, + "chainer": { + "in": [ + "sentence1", + "sentence2" + ], + "in_y": [ + "y" + ], + "pipe": [ + { + "class_name": "torch_transformers_preprocessor", + "vocab_file": "{BASE_MODEL}", + "do_lower_case": false, + "max_seq_length": 192, + "truncation": "longest_first", + "padding": "longest", + "in": [ + "sentence1", + "sentence2" + ], + "out": [ + "bert_features" + ] + }, + { + "id": "classes_vocab", + "class_name": "simple_vocab", + "fit_on": [ + "y" + ], + "save_path": "{MODEL_PATH}/classes.dict", + "load_path": "{MODEL_PATH}/classes.dict", + "in": [ + "y" + ], + "out": [ + "y_ids" + ] + }, + { + "in": [ + "y_ids" + ], + "out": [ + "y_onehot" + ], + "class_name": "one_hotter", + "depth": "#classes_vocab.len", + "single_vector": true + }, + { + "class_name": "torch_transformers_classifier", + "n_classes": "#classes_vocab.len", + "return_probas": true, + "pretrained_bert": "{BASE_MODEL}", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 1e-05 + }, + "learning_rate_drop_patience": 3, + "learning_rate_drop_div": 2.0, + "in": [ + "bert_features" + ], + "in_y": [ + "y_ids" + ], + "out": [ + "y_pred_probas" + ] + }, + { + "in": [ + "y_pred_probas" + ], + "out": [ + "y_pred_ids" + ], + "class_name": "proba2labels", + "max_proba": true + }, + { + "in": [ + "y_pred_ids" + ], + "out": [ + "y_pred_labels" + ], + "ref": "classes_vocab" + } + ], + "out": [ + "y_pred_labels" + ] + }, + "train": { + "batch_size": 24, + "metrics": [ + "accuracy" + ], + "epochs": 1, + "val_every_n_batches": 250, + 
"log_every_n_batches": 250, + "show_examples": false, + "evaluation_targets": [ + "train", + "valid" + ], + "class_name": "torch_trainer", + "tensorboard_log_dir": "{MODEL_PATH}/", + "pytest_max_batches": 2 + } +} diff --git a/deeppavlov/configs/classifiers/superglue/superglue_copa_roberta.json b/deeppavlov/configs/classifiers/superglue/superglue_copa_roberta.json index 1a9fda443d..101f474412 100644 --- a/deeppavlov/configs/classifiers/superglue/superglue_copa_roberta.json +++ b/deeppavlov/configs/classifiers/superglue/superglue_copa_roberta.json @@ -1,97 +1,147 @@ { - "dataset_reader": { - "class_name": "huggingface_dataset_reader", - "path": "super_glue", - "name": "copa", - "train": "train", - "valid": "validation", - "test": "test" - }, - "dataset_iterator": { - "class_name": "huggingface_dataset_iterator", - "features": ["contexts", "choices"], - "label": "label", - "seed": 42 - }, - "chainer": { - "in": ["contexts_list", "choices_list"], - "in_y": ["y"], - "pipe": [ - { - "class_name": "torch_transformers_multiplechoice_preprocessor", - "vocab_file": "{BASE_MODEL}", - "do_lower_case": false, - "max_seq_length": 64, - "in": ["contexts_list", "choices_list"], - "out": ["bert_features"] - }, - { - "id": "classes_vocab", - "class_name": "simple_vocab", - "fit_on": ["y"], - "save_path": "{MODEL_PATH}/classes.dict", - "load_path": "{MODEL_PATH}/classes.dict", - "in": ["y"], - "out": ["y_ids"] - }, - { - "in": ["y_ids"], - "out": ["y_onehot"], - "class_name": "one_hotter", - "depth": "#classes_vocab.len", - "single_vector": true - }, - { - "class_name": "torch_transformers_multiplechoice", - "n_classes": "#classes_vocab.len", - "return_probas": true, - "pretrained_bert": "{BASE_MODEL}", - "save_path": "{MODEL_PATH}/model", - "load_path": "{MODEL_PATH}/model", - "optimizer": "AdamW", - "optimizer_parameters": { - "lr": 2e-05 + "dataset_reader": { + "class_name": "huggingface_dataset_reader", + "path": "super_glue", + "name": "copa", + "train": "train", + "valid": "validation", + "test": "test" + }, + "dataset_iterator": { + "class_name": "huggingface_dataset_iterator", + "features": [ + "contexts", + "choices" + ], + "label": "label", + "seed": 42 + }, + "chainer": { + "in": [ + "contexts_list", + "choices_list" + ], + "in_y": [ + "y" + ], + "pipe": [ + { + "class_name": "torch_transformers_multiplechoice_preprocessor", + "vocab_file": "{BASE_MODEL}", + "do_lower_case": false, + "max_seq_length": 64, + "in": [ + "contexts_list", + "choices_list" + ], + "out": [ + "bert_features" + ] + }, + { + "id": "classes_vocab", + "class_name": "simple_vocab", + "fit_on": [ + "y" + ], + "save_path": "{MODEL_PATH}/classes.dict", + "load_path": "{MODEL_PATH}/classes.dict", + "in": [ + "y" + ], + "out": [ + "y_ids" + ] + }, + { + "in": [ + "y_ids" + ], + "out": [ + "y_onehot" + ], + "class_name": "one_hotter", + "depth": "#classes_vocab.len", + "single_vector": true + }, + { + "class_name": "torch_transformers_multiplechoice", + "n_classes": "#classes_vocab.len", + "return_probas": true, + "pretrained_bert": "{BASE_MODEL}", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 2e-05 + }, + "learning_rate_drop_patience": 3, + "learning_rate_drop_div": 2.0, + "in": [ + "bert_features" + ], + "in_y": [ + "y_ids" + ], + "out": [ + "y_pred_probas" + ] + }, + { + "in": [ + "y_pred_probas" + ], + "out": [ + "y_pred_ids" + ], + "class_name": "proba2labels", + "max_proba": true + }, + { + "in": [ + "y_pred_ids" + ], + "out": [ + 
"y_pred_labels" + ], + "ref": "classes_vocab" + } + ], + "out": [ + "y_pred_labels" + ] + }, + "train": { + "batch_size": 16, + "metrics": [ + "accuracy" + ], + "validation_patience": 10, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "evaluation_targets": [ + "train", + "valid" + ], + "class_name": "torch_trainer", + "tensorboard_log_dir": "{MODEL_PATH}/", + "pytest_max_batches": 2, + "pytest_batch_size": 2 + }, + "metadata": { + "variables": { + "BASE_MODEL": "roberta-large", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/classifiers/superglue_copa_{BASE_MODEL}" }, - "learning_rate_drop_patience": 3, - "learning_rate_drop_div": 2.0, - "in": ["bert_features"], - "in_y": ["y_ids"], - "out": ["y_pred_probas"] - }, - { - "in": ["y_pred_probas"], - "out": ["y_pred_ids"], - "class_name": "proba2labels", - "max_proba": true - }, - { - "in": ["y_pred_ids"], - "out": ["y_pred_labels"], - "ref": "classes_vocab" - } - ], - "out": ["y_pred_labels"] - }, - "train": { - "batch_size": 16, - "metrics": ["accuracy"], - "validation_patience": 10, - "val_every_n_epochs": 1, - "log_every_n_epochs": 1, - "show_examples": false, - "evaluation_targets": ["train", "valid"], - "class_name": "torch_trainer", - "tensorboard_log_dir": "{MODEL_PATH}/", - "pytest_max_batches": 2, - "pytest_batch_size": 2 - }, - "metadata": { - "variables": { - "BASE_MODEL": "roberta-large", - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models", - "MODEL_PATH": "{MODELS_PATH}/classifiers/superglue_copa_{BASE_MODEL}" + "download": [ + { + "url": "http://files.deeppavlov.ai/0.17/classifiers/superglue/superglue_copa_roberta.tar.gz", + "subdir": "{MODELS_PATH}" + } + ] } - } } diff --git a/deeppavlov/configs/classifiers/superglue/superglue_record_roberta.json b/deeppavlov/configs/classifiers/superglue/superglue_record_roberta.json index e537a098f3..c21bcf193e 100644 --- a/deeppavlov/configs/classifiers/superglue/superglue_record_roberta.json +++ b/deeppavlov/configs/classifiers/superglue/superglue_record_roberta.json @@ -13,7 +13,7 @@ "download": [ { "url": "http://files.deeppavlov.ai/0.17/classifiers/superglue/superglue_record_roberta.tar.gz", - "subdir": "{MODEL_PATH}" + "subdir": "{MODELS_PATH}" } ] }, diff --git a/deeppavlov/configs/regressors/translation_ranker.json b/deeppavlov/configs/regressors/translation_ranker.json new file mode 100644 index 0000000000..161a6ad2c5 --- /dev/null +++ b/deeppavlov/configs/regressors/translation_ranker.json @@ -0,0 +1,105 @@ +{ + "metadata": + { + "variables": { + "BASE_MODEL": "cointegrated/LaBSE-en-ru", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/classifiers/ranker_labse", + "SEED": 42 + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/v1/tmp/translation_ranker.tar.gz", + "subdir": "{MODELS_PATH}" + } + ] + }, + "dataset_iterator": { + "class_name": "huggingface_dataset_iterator", + "features": [ + "source", + "hypothesis" + ], + "label": "agg_score", + "seed": "{SEED}", + "use_label_name": false + }, + "chainer": { + "in": [ + "source", + "hypothesis" + ], + "in_y": [ + "score" + ], + "pipe": [ + { + "class_name": "torch_transformers_preprocessor", + "vocab_file": "{BASE_MODEL}", + "do_lower_case": false, + "max_seq_length": 256, + "in": [ + "source", + "hypothesis" + ], + "out": [ + 
"bert_features" + ] + }, + { + "class_name": "torch_transformers_classifier", + "n_classes": 1, + "return_probas": false, + "pretrained_bert": "{BASE_MODEL}", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 2e-06, + "weight_decay": 0.1 + }, + "learning_rate_drop_patience": 3, + "learning_rate_drop_div": 2.0, + "in": [ + "bert_features" + ], + "in_y": [ + "score" + ], + "out": [ + "pred_score" + ] + } + ], + "out": [ + "pred_score" + ] + }, + "train": { + "batch_size": 32, + "metrics": [ + { + "name": "mean_squared_error", + "inputs": [ + "score", + "pred_score" + ] + } + ], + "validation_patience": 10, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "class_name": "torch_trainer", + "evaluation_targets": [ + "train", + "valid" + ], + "metric_optimization": "minimize", + "tensorboard_log_dir": "{MODEL_PATH}/", + "pytest_max_batches": 2, + "pytest_batch_size": 2 + } +}