This repository has been archived by the owner on Aug 28, 2023. It is now read-only.

[82639] Add HF Hub model download for autogenerated notebooks #52

Open · wants to merge 11 commits into master
3 changes: 3 additions & 0 deletions automation/bom/image_BOM.txt
@@ -506,6 +506,9 @@ wb/main/jupyter_notebooks/cell_templates/summary_docs_cell.jinja
wb/main/jupyter_notebooks/cell_templates/tokenize_dataset_code_cell.jinja
wb/main/jupyter_notebooks/cell_templates/tokenize_dataset_docs_cell.jinja
wb/main/jupyter_notebooks/cell_templates/tokenizer_parameters_code_cell.jinja
wb/main/jupyter_notebooks/cell_templates/transformers_onnx_converter_code_cell.jinja
wb/main/jupyter_notebooks/cell_templates/transformers_onnx_converter_docs_cell.jinja
wb/main/jupyter_notebooks/cell_templates/transformers_onnx_converter_result_docs_cell.jinja
wb/main/jupyter_notebooks/cell_templates/validate_ir_model_code_cell.jinja
wb/main/jupyter_notebooks/cell_templates/validate_ir_model_docs_cell.jinja
wb/main/jupyter_notebooks/cli_tools_options.py
10 changes: 10 additions & 0 deletions wb/main/enumerates.py
@@ -285,6 +285,16 @@ class ModelSourceEnum(enum.Enum):
ir = 'ir'
huggingface = 'huggingface'

def get_name(self) -> str:
if self.value == "omz":
return "OMZ"
if self.value == "original":
return "Original"
if self.value == "ir":
return "IR"
if self.value == "huggingface":
return "Hugging Face Hub"
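Not part of the diff: a quick usage sketch of the helper above, assuming the enum is imported from `wb/main/enumerates.py`.

```python
from wb.main.enumerates import ModelSourceEnum

# Map each model source to its display name.
print(ModelSourceEnum.huggingface.get_name())  # "Hugging Face Hub"
print(ModelSourceEnum.omz.get_name())          # "OMZ"
```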


class TargetOSEnum(enum.Enum):
ubuntu18 = 'ubuntu18'
10 changes: 10 additions & 0 deletions wb/main/jupyter_notebooks/cell_template_contexts.py
@@ -31,6 +31,7 @@ class IntroCellTemplateContext(TypedDict):
project_model_task_type: str
project_model_framework: str
project_model_precisions: str
project_model_source: str
has_tokenizer_section: bool
has_accuracy_checker_section: bool
has_int8_calibration_section: bool
@@ -79,12 +80,17 @@ class ProfilingCodeCellTemplateContext(PythonToolCodeCellTemplateContext):
has_tokenizer_section: bool


class ProfilingDocsCellTemplateContext(PythonToolCodeCellTemplateContext):
is_nlp: bool


class TokenizerParametersTemplateContext(TypedDict):
tokenizer_path: Optional[str]
dataset_path: str
batch: Optional[int]
streams: Optional[int]


class AccuracyDocsCellTemplateContext(TypedDict):
yaml_config_path: str

@@ -107,3 +113,7 @@ class Int8OptimizationCodeCellTemplateContext(Int8OptimizationDocsCellTemplateContext):

class InstallRequirementsCodeCellTemplateContext(TypedDict):
requirements_file: str


class TransformersONNXCodeCellTemplateContext(TypedDict):
model_checkpoint: str
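Not part of the PR: a minimal sketch of how this context might be rendered; the checkpoint value is a hypothetical example, and the inline template string merely mirrors `transformers_onnx_converter_code_cell.jinja`.

```python
from jinja2 import Template

from wb.main.jupyter_notebooks.cell_template_contexts import TransformersONNXCodeCellTemplateContext

# Hypothetical checkpoint name, used only for illustration.
context: TransformersONNXCodeCellTemplateContext = {
    "model_checkpoint": "distilbert-base-uncased-finetuned-sst-2-english",
}

# Inline stand-in for the Jinja cell template added in this PR.
template = Template(
    "!python -m transformers.onnx \\\n"
    "    --model={{ model_checkpoint }} \\\n"
    "    --feature sequence-classification \\\n"
    "    onnx"
)
print(template.render(**context))
```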
(changed file: tutorial intro docs cell template)
@@ -4,9 +4,9 @@

The purpose of this tutorial is to guide you through the stages of working with a model to optimize it and prepare it for production using the OpenVINO toolkit. The model used in the tutorial was imported from the DL Workbench project and has the following characteristics:

| Model Name | Domain | Task Type | Framework | Precisions |
| :---: | :---: | :---: | :---: | :---: |
| {{ project_model_name }} | {{ project_model_domain }} | {{ project_model_task_type | replace('_', ' ') | title }} | {{ SupportedFrameworksEnum.get_name(project_model_framework) }} | {{ project_model_precisions }} |
| Model Name | Domain | Task Type | Framework | Precisions | Source |
| :---: | :---: | :---: | :---: | :---: | :---: |
| {{ project_model_name }} | {{ project_model_domain }} | {{ project_model_task_type | replace('_', ' ') | title }} | {{ SupportedFrameworksEnum.get_name(project_model_framework) }} | {{ project_model_precisions }} | {{ project_model_source }} |

{% if project_model_task_type == TaskEnum.object_detection.value -%}
This model is trained to solve Object Detection task. The goal of Object Detection is to recognize instances of object classes (for example: people, cars, animals) and describe the locations of each detected object in the image using a bounding box.
@@ -26,6 +26,8 @@ The purpose of this tutorial is to guide you through the stages of working with
This model is trained to solve Facial Landmark Detection task. The goal of Facial Landmark Detection is to detect key landmarks of a face, such as eyes, nose, and mouth.
{% elif project_model_task_type == TaskEnum.face_recognition.value -%}
This model is trained to solve Face Recognition task. The goal of Face Recognition is to identify a person based on an image of their face.
{% elif project_model_task_type == TaskEnum.text_classification.value -%}
This model is trained to solve Text Classification task. The goal of Text Classification is to assign an appropriate category (label or class) to a sentence or document.
{% else -%}
This model is trained to solve Generic task. The goal of Generic task is to enable you to perform different kinds of experiments with your data.
{% endif %}
(changed file: Model Optimizer docs cell template)
@@ -23,7 +23,7 @@
`--silent` | Prevent any output messages except those that correspond to log level equals ERROR, that can be set with the following option: --log_level. By default, log level is already ERROR.
`--freeze_placeholder_with_value` | Replaces input layer with constant node with provided value, for example: "node_name->True". It will be DEPRECATED in future releases. Use --input option to specify a value for freezing.
`--generate_deprecated_IR_V7` | Force to generate deprecated IR V7 with layers from old IR specification.
`--static_shape` | Enables IR generation for fixed input shape (folding `ShapeOf` operations and shape-calculating sub-graphs to `Constant`). Changing model input shape using the Inference Engine API in runtime may fail for such an IR.
`--static_shape` | Enables IR generation for fixed input shape (folding `ShapeOf` operations and shape-calculating sub-graphs to `Constant`). Changing model input shape using the OpenVINO API in runtime may fail for such an IR.
`--keep_shape_ops` | The option is ignored. Expected behavior is enabled by default.
`--disable_weights_compression` | Disable compression and store weights with original precision.
`--progress` | Enable model conversion progress display.
(changed file: obtain model docs cell template)
@@ -13,9 +13,21 @@ In this tutorial we are working with the following model:

- Model Name: **{{ project_model_name }}**

- Model Source: {{ 'Open Model Zoo' if project_model_source == ModelSourceEnum.omz.value else 'User provided model' }}
- Model Source:
{% if project_model_source == ModelSourceEnum.huggingface.value %}
Hugging Face Hub
{% elif project_model_source == ModelSourceEnum.omz.value %}
Open Model Zoo
{% else %}
User provided model
{% endif %}

- Model Framework: {{ SupportedFrameworksEnum.get_name(project_model_framework) }}
- Model Framework:
{% if project_model_source == ModelSourceEnum.huggingface.value %}
{{ SupportedFrameworksEnum.get_name("pytorch") }}
{% else %}
{{ SupportedFrameworksEnum.get_name(project_model_framework) }}
{% endif %}

- Steps to obtain IR:
{% if project_model_source == ModelSourceEnum.ir.value %}
@@ -26,6 +38,8 @@ No conversion to IR required. Download the model with the Model Downloader and p
{% else %}
Download the model with Model Downloader and then convert it to IR format with Model Converter.
{% endif %}
{% elif project_model_source == ModelSourceEnum.huggingface.value %}
Your original model is in the PyTorch format. Use the `transformers.onnx` CLI tool to convert it to ONNX, then convert the model to the IR format with Model Optimizer (a combined sketch of both steps follows below).
{% elif project_model_source == ModelSourceEnum.original.value %}
Your original model is in one of the supported frameworks. Convert model to IR format with Model Optimizer.
{% endif %}
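A minimal end-to-end sketch of the Hugging Face path described in the template above, assuming `transformers[onnx]` and `openvino-dev` are installed; the checkpoint name and output directories are placeholder assumptions, not values used by the PR.

```python
import subprocess

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"  # hypothetical example

# 1. Export the PyTorch checkpoint from the Hugging Face Hub to ONNX.
subprocess.run(
    ["python", "-m", "transformers.onnx",
     f"--model={checkpoint}",
     "--feature", "sequence-classification",
     "onnx"],
    check=True,
)

# 2. Convert the resulting ONNX file to OpenVINO IR with Model Optimizer.
subprocess.run(
    ["mo", "--input_model", "onnx/model.onnx", "--output_dir", "ir"],
    check=True,
)
```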
(changed file: profiling docs cell template)
@@ -2,11 +2,16 @@

### Motivation

Model performance is the amount of information that your model can process per unit of time. In Computer Vision model performance defines how fast your model can process a number of images and generate the desired output. Usually it is measured in Frames Per Second (FPS).
Model performance is the amount of information that your model can process per unit of time.
{% if is_nlp %}In NLP, model performance defines how fast your model can process a number of text samples and generate the desired output. Usually it is measured in Samples Per Second (SPS).
{% else %}
In Computer Vision, model performance defines how fast your model can process a number of images and generate the desired output. Usually it is measured in Frames Per Second (FPS).
{% endif %}
OpenVINO uses the term Inference to denote the stage of a single network execution.
Inference is the stage at which a trained model is used to predict on test samples; it consists of a forward pass similar to the one performed during training.

In the OpenVINO toolkit, inference is performed by the Benchmark Tool.
{% if is_nlp %} Note that the Benchmark Tool was initially developed for the Computer Vision (CV) use case and reports inference results in Frames Per Second (FPS in CV = SPS in NLP). {% endif %}

### OpenVINO Tool: Benchmark Tool
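Not part of the template: a sketch of how throughput could be measured with the Benchmark Tool (`benchmark_app`, installed with `openvino-dev`); the IR path and device below are placeholder assumptions.

```python
import subprocess

# Run the Benchmark Tool for 30 seconds on CPU and report throughput (FPS/SPS).
subprocess.run(
    ["benchmark_app",
     "-m", "ir/model.xml",  # path to the IR produced earlier (assumed)
     "-d", "CPU",           # target device
     "-t", "30"],           # benchmark duration in seconds
    check=True,
)
```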

7 changes: 7 additions & 0 deletions wb/main/jupyter_notebooks/cell_templates/transformers_onnx_converter_code_cell.jinja
@@ -0,0 +1,7 @@
{% include 'autogenerated_note_code.jinja' %}


!python -m transformers.onnx \
--model={{ model_checkpoint }} \
--feature sequence-classification \
onnx
32 changes: 32 additions & 0 deletions wb/main/jupyter_notebooks/cell_templates/transformers_onnx_converter_docs_cell.jinja
@@ -0,0 +1,32 @@
### Get ONNX model from Hugging Face Hub

#### Motivation

Most of the models on the Hugging Face Hub are stored in the PyTorch format.
To get an Intermediate Representation (IR), the preferred model format for working with OpenVINO, you first need to convert the model to the ONNX format.
For that, use the `transformers.onnx` CLI tool from the Transformers library, which is not part of OpenVINO.

#### Main usage

The `transformers.onnx` tool takes the name of the model repository on the Hugging Face Hub and the model task (feature).
It then downloads all necessary files, converts the model to the ONNX format, and checks the resulting model.

#### Description

`transformers.onnx` will execute the following steps:

1. Download the model files and tokenizer files from the Hugging Face Hub.
1. Generate the dummy input with the tokenizer and pass it to the model to trace the model execution graph.
1. Use the execution graph to generate an ONNX model.
1. Check that the resulting model output is close to the original model output.

To learn more about this CLI tool, read the [documentation](https://huggingface.co/docs/transformers/main/en/serialization#onnx).
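For illustration only, a minimal sketch of the kind of output check performed in step 4, assuming the export produced `onnx/model.onnx`; the checkpoint name is a hypothetical example.

```python
import numpy as np
import onnxruntime as ort
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"  # hypothetical example
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
pt_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Run the original PyTorch model on a dummy sentence.
inputs = tokenizer("ONNX export sanity check", return_tensors="pt")
with torch.no_grad():
    pt_logits = pt_model(**inputs).logits.numpy()

# Run the exported ONNX model on the same inputs.
session = ort.InferenceSession("onnx/model.onnx")
onnx_logits = session.run(
    None, {name: tensor.numpy() for name, tensor in inputs.items()}
)[0]

# The exported graph should reproduce the original outputs within a small tolerance.
print("max abs diff:", np.abs(pt_logits - onnx_logits).max())
```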

#### Used Command-Line Arguments

<details>
<summary>View transformers.onnx command-line arguments</summary>

{{ CLIToolEnum.transformers_onnx.format_to_markdown_table() | safe }}

</details>
2 changes: 2 additions & 0 deletions wb/main/jupyter_notebooks/cell_templates/transformers_onnx_converter_result_docs_cell.jinja
@@ -0,0 +1,2 @@
As a result, we have converted the PyTorch model to the ONNX format with the `transformers.onnx` tool.
You can find the model in the `onnx` folder in the root directory.
wb/main/jupyter_notebooks/cell_templates/validate_ir_model_code_cell.jinja
@@ -3,9 +3,9 @@

from openvino.runtime import Core

# Create an Inference Engine instance
# Create an OpenVINO Core instance
core = Core()

# Read the network from IR files
model = core.read_model(model=model_xml_file_path, weights=model_bin_file_path)
print(f'Model {model.friendly_name} was successfully loaded to Inference Engine.')
print(f'Model {model.friendly_name} was successfully loaded to OpenVINO.')
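A possible follow-up, not part of the generated cell: compile the loaded IR and list its inputs and outputs. The file paths and device name are placeholder assumptions.

```python
from openvino.runtime import Core

model_xml_file_path = "ir/model.xml"  # assumed IR location
model_bin_file_path = "ir/model.bin"

core = Core()
model = core.read_model(model=model_xml_file_path, weights=model_bin_file_path)

# Compile for a target device to confirm the IR is actually loadable and executable.
compiled_model = core.compile_model(model=model, device_name="CPU")
print("Inputs: ", [port.any_name for port in compiled_model.inputs])
print("Outputs:", [port.any_name for port in compiled_model.outputs])
```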
wb/main/jupyter_notebooks/cell_templates/validate_ir_model_docs_cell.jinja
@@ -1 +1 @@
Let's check that your model is a valid OpenVINO IR file. To do that, we use OpenVINO Inference Engine Python* API. Refer to the [documentation](https://docs.openvino.ai/latest/openvino_inference_engine_ie_bridges_python_docs_api_overview.html) for more details.
Let's check that your model is a valid OpenVINO IR file. To do that, we use the OpenVINO Python* API. Refer to the [documentation](https://docs.openvino.ai/latest/api/ie_python_api/api.html) for more details.
16 changes: 12 additions & 4 deletions wb/main/jupyter_notebooks/cli_tools_options.py
@@ -151,6 +151,9 @@ class CLIToolEnum(enum.Enum):
pot = CLITool(path='pot',
displayed_options={'-c', '--output-dir', '--direct-dump'})

transformers_onnx = CLITool(path='python -m transformers.onnx',
displayed_options=set())

def format_to_markdown_table(self) -> str:
return CLIToolHelpToMarkdownTableFormatter.format(cli_tool=self)

@@ -185,17 +188,22 @@ def initialize(self):
dump_data = CLIToolsOptionsDumper.deserialize()
tool_options_map = dump_data.get('options_map')
dumped_wb_version = dump_data.get('wb_version')
if current_wb_version == dumped_wb_version and tool_options_map and all(
[value in CLIToolEnum.keys() for _, value in enumerate(tool_options_map)]):
new_tools = {tool.name for tool in CLIToolEnum} - set(tool_options_map)
if (
current_wb_version == dumped_wb_version
and tool_options_map
and all(value in CLIToolEnum.keys() for value in tool_options_map)
and not new_tools
):
self._options_map = tool_options_map
return
tool_processes = []
processes_tool_options_map = Manager().dict()
for tool_enum in CLIToolEnum:
tool_name = tool_enum.name
tool_name = tool_enum.name.split()
tool_path = tool_enum.value['path']
process = Process(target=self._update_parsed_tool_options_map,
args=(tool_name, tool_path, processes_tool_options_map))
args=(*tool_name, tool_path, processes_tool_options_map))
tool_processes.append(process)
process.start()
for process in tool_processes:
23 changes: 22 additions & 1 deletion wb/main/jupyter_notebooks/jupyter_notebook_cell.py
@@ -66,6 +66,9 @@ class NotebookCellIds(enum.Enum):
tokenize_dataset_docs = 'tokenize_dataset_docs'
tokenize_dataset_code = 'tokenize_dataset_code'
tokenizer_parameters_code = 'tokenizer_parameters_code'
transformers_onnx_converter_docs = 'transformers_onnx_converter_docs'
transformers_onnx_converter_result_docs = 'transformers_onnx_converter_result_docs'
transformers_onnx_converter_code = 'transformers_onnx_converter_code'


class NotebookCellConfig:
@@ -144,6 +147,24 @@ class NotebookCells:
cell_type=NotebookCellTypes.markdown,
template_filename='model_converter_result_docs_cell.jinja')

transformers_onnx_converter_docs = NotebookCellConfig(
cell_id=NotebookCellIds.transformers_onnx_converter_docs,
cell_type=NotebookCellTypes.markdown,
template_filename='transformers_onnx_converter_docs_cell.jinja'
)

transformers_onnx_converter_result_docs = NotebookCellConfig(
cell_id=NotebookCellIds.transformers_onnx_converter_result_docs,
cell_type=NotebookCellTypes.markdown,
template_filename='transformers_onnx_converter_result_docs_cell.jinja'
)

transformers_onnx_converter_code = NotebookCellConfig(
cell_id=NotebookCellIds.transformers_onnx_converter_code,
cell_type=NotebookCellTypes.code,
template_filename='transformers_onnx_converter_code_cell.jinja'
)

model_optimizer_docs = NotebookCellConfig(
cell_id=NotebookCellIds.model_optimizer_docs,
cell_type=NotebookCellTypes.markdown,
@@ -179,7 +200,7 @@ class NotebookCells:
cell_type=NotebookCellTypes.code,
template_filename='validate_ir_model_code_cell.jinja')

tokenizer_parameters_code_code = NotebookCellConfig(
tokenizer_parameters_code = NotebookCellConfig(
cell_id=NotebookCellIds.tokenizer_parameters_code,
cell_type=NotebookCellTypes.code,
template_filename='tokenizer_parameters_code_cell.jinja')
13 changes: 12 additions & 1 deletion wb/main/jupyter_notebooks/notebook_template_creator.py
@@ -48,7 +48,7 @@ class NotebookTemplateCreator:
],
model_tokenizer=[
NotebookCells.load_tokenizer_docs,
NotebookCells.tokenizer_parameters_code_code,
NotebookCells.tokenizer_parameters_code,
NotebookCells.load_tokenizer_code,
NotebookCells.tokenize_dataset_docs,
NotebookCells.tokenize_dataset_code,
@@ -98,6 +98,17 @@ def _obtain_model_section_cells(self) -> List[NotebookCellConfig]:
])
obtain_model_cells.append(NotebookCells.obtain_model_result_docs)
return obtain_model_cells
if self._original_model_source == ModelSourceEnum.huggingface:
return [
NotebookCells.obtain_model_docs,
NotebookCells.transformers_onnx_converter_docs,
NotebookCells.transformers_onnx_converter_code,
NotebookCells.transformers_onnx_converter_result_docs,
NotebookCells.model_optimizer_docs,
NotebookCells.model_optimizer_code,
NotebookCells.model_optimizer_result_docs,
NotebookCells.obtain_model_result_docs,
]
if self._original_model_source == ModelSourceEnum.original:
return [
NotebookCells.obtain_model_docs,
6 changes: 5 additions & 1 deletion wb/main/jupyter_notebooks/resources/requirements_nlp.txt
@@ -2,5 +2,9 @@ PyYAML==5.4.1
numpy==1.19.5
progress==1.6

transformers==4.12.2
transformers[onnx]==4.16.2

# tokenization dependencies
sentencepiece==0.1.96
fugashi==1.1.2 # ja tokenizers
ipadic==1.0.0 # ja tokenizers