speech recognition dynamic version (openvinotoolkit#3237)
* speech recognition dynamic version

* fix flake errors

* update README and add dynamic flag

* update flag and README

* update dynamic cases
Aleksei Korobeinikov authored Feb 20, 2022
1 parent 2501256 commit 00d8a33
Showing 4 changed files with 22 additions and 14 deletions.
13 changes: 7 additions & 6 deletions demos/speech_recognition_wav2vec_demo/python/README.md
@@ -34,7 +34,7 @@ omz_converter --list models.lst
Run the application with the `-h` option to see the help message.

```
usage: speech_recognition_wav2vec_demo.py [-h] -m MODEL -i INPUT [-d DEVICE]
usage: speech_recognition_wav2vec_demo.py [-h] -m MODEL -i INPUT [-d DEVICE] [--vocab VOCAB] [--dynamic_shape]
optional arguments:
-h, --help Show this help message and exit.
@@ -43,11 +43,12 @@ optional arguments:
-i INPUT, --input INPUT
Required. Path to an audio file in WAV PCM 16 kHz mono format.
-d DEVICE, --device DEVICE
Optional. Specify the target device to infer on, for
example: CPU, GPU, HDDL, MYRIAD or HETERO. The
demo will look for a suitable IE plugin for this
device. Default value is CPU.
--vocab VOCAB Optional. Path to an .json file with model encoding vocabulary.
Optional. Specify the target device to infer on, for example: CPU, GPU, HDDL, MYRIAD or
HETERO. The demo will look for a suitable IE plugin for this device. Default value is
CPU.
--vocab VOCAB Optional. Path to a .json file with encoding vocabulary.
--dynamic_shape Optional. Use dynamic shapes for model inputs and outputs.
```

The typical command line is:
@@ -40,6 +40,8 @@ def build_argparser():
"CPU, GPU, HDDL, MYRIAD or HETERO. "
"The demo will look for a suitable IE plugin for this device. Default value is CPU.")
parser.add_argument('--vocab', help='Optional. Path to a .json file with encoding vocabulary.')
parser.add_argument('--dynamic_shape', action='store_true',
help='Optional. Use dynamic shapes for model inputs.')
return parser
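The new `--dynamic_shape` switch is a standard `argparse` boolean flag. A minimal, self-contained sketch of just that flag (the parser here is a stand-in for the demo's full one, with only the new option shown):

```python
import argparse

def build_argparser():
    # Minimal stand-in for the demo's parser: only the new flag is shown.
    parser = argparse.ArgumentParser(description='Wav2Vec demo (sketch)')
    parser.add_argument('--dynamic_shape', action='store_true',
                        help='Optional. Use dynamic shapes for model inputs.')
    return parser

# store_true makes the attribute False by default and True when the flag
# is present, which is what gates the reshape logic below.
args = build_argparser().parse_args(['--dynamic_shape'])
print(args.dynamic_shape)  # True
```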


@@ -51,23 +53,26 @@ class Wav2Vec:
words_delimiter = '|'
pad_token = '<pad>'

def __init__(self, core, model_path, input_shape, device, vocab_file):
def __init__(self, core, model_path, input_shape, device, vocab_file, dynamic_flag):
log.info('Reading model {}'.format(model_path))
model = core.read_model(model_path)
if len(model.inputs) != 1:
raise RuntimeError('Wav2Vec must have one input')
self.input_tensor_name = model.inputs[0].get_any_name()
model_input_shape = model.inputs[0].shape
model_input_shape = model.inputs[0].partial_shape
if len(model_input_shape) != 2:
raise RuntimeError('Wav2Vec input must be 2-dimensional')
if len(model.outputs) != 1:
raise RuntimeError('Wav2Vec must have one output')
model_output_shape = model.outputs[0].shape
model_output_shape = model.outputs[0].partial_shape
if len(model_output_shape) != 3:
raise RuntimeError('Wav2Vec output must be 3-dimensional')
if model_output_shape[2] != len(self.alphabet):
raise RuntimeError(f'Wav2Vec output third dimension size must be {len(self.alphabet)}')
model.reshape({self.input_tensor_name: PartialShape(input_shape)})
if not dynamic_flag:
model.reshape({self.input_tensor_name: PartialShape(input_shape)})
elif not model.is_dynamic():
model.reshape({self.input_tensor_name: PartialShape((-1, -1))})
compiled_model = core.compile_model(model, device)
self.output_tensor = compiled_model.outputs[0]
self.infer_request = compiled_model.create_infer_request()
@@ -124,7 +129,7 @@ def main():
log.info('\tbuild: {}'.format(get_version()))
core = Core()

model = Wav2Vec(core, args.model, audio.shape, args.device, args.vocab)
model = Wav2Vec(core, args.model, audio.shape, args.device, args.vocab, args.dynamic_shape)
normalized_audio = model.preprocess(audio)
character_probs = model.infer(normalized_audio)
transcription = model.decode(character_probs)
7 changes: 5 additions & 2 deletions models/public/wav2vec2-base/README.md
@@ -25,11 +25,13 @@ For details please also check [repository](https://github.com/pytorch/fairseq/tr

#### Original model

Normalized audio signal, name - `inputs`, shape - `1, 30480`, format is `B, N`, where:
Normalized audio signal, name - `inputs`, shape - `B, N`, format is `B, N`, where:

- `B` - batch size
- `N` - sequence length

The model is dynamic and can work with different input shapes.

**NOTE**: Model expects 16-bit, 16 kHz, mono-channel WAVE audio as input data.
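Before inference, the raw signal is typically normalized to zero mean and unit variance, as is usual for wav2vec2-style models. A hedged NumPy sketch (the epsilon value and function name are illustrative, not the demo's exact code):

```python
import numpy as np

def preprocess(audio: np.ndarray) -> np.ndarray:
    # Zero-mean, unit-variance normalization of the 1-D PCM signal;
    # the small epsilon guards against division by zero on silent input.
    audio = audio.astype(np.float32)
    return (audio - audio.mean()) / np.sqrt(audio.var() + 1e-7)

signal = np.array([0.0, 0.5, -0.5, 1.0], dtype=np.float32)
normalized = preprocess(signal)
print(normalized.mean())  # close to 0
```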

#### Converted model
@@ -40,12 +42,13 @@ The converted model has the same parameters as the original model.

#### Original model

Per-token probabilities (after LogSoftmax) for every symbol in the alphabet, name - `logits`, shape - `1, 95, 32`, output data format is `B, N, C`, where:
Per-token probabilities (after LogSoftmax) for every symbol in the alphabet, name - `logits`, shape - `B, N, 32`, output data format is `B, N, C`, where:

- `B` - batch size
- `N` - number of recognized tokens
- `C` - alphabet size

The `B` and `N` dimensions can take different values because the model is dynamic. The alphabet size `C` is static and equals 32.
Model alphabet: "[pad]", "[s]", "[/s]", "[unk]", "|", "E", "T", "A", "O", "N", "I", "H", "S", "R", "D", "L", "U", "M", "W", "C", "F", "G", "Y", "P", "B", "V", "K", "'", "X", "J", "Q", "Z", where:

- `[pad]` - padding token used as CTC-blank label
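Since `[pad]` serves as the CTC-blank label, a greedy decoder collapses consecutive repeated indices, drops blanks, then maps `|` to a word space. A minimal pure-Python sketch (not the demo's exact decoder), using the alphabet listed above:

```python
# Greedy CTC decoding over per-frame argmax indices: collapse consecutive
# repeats, drop the blank ([pad], index 0), then map '|' to a word space.
ALPHABET = ['[pad]', '[s]', '[/s]', '[unk]', '|', 'E', 'T', 'A', 'O', 'N',
            'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G',
            'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z']

def ctc_greedy_decode(indices):
    chars = []
    prev = None
    for idx in indices:
        if idx != prev and idx != 0:  # skip repeats and the CTC blank
            chars.append(ALPHABET[idx])
        prev = idx
    return ''.join(chars).replace('|', ' ')

# Per-frame indices for "HELLO": the blank between the two 15s ('L')
# is what allows the double letter to survive repeat-collapsing.
frames = [11, 11, 5, 0, 15, 15, 0, 15, 8]
print(ctc_greedy_decode(frames))  # HELLO
```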
1 change: 0 additions & 1 deletion models/public/wav2vec2-base/model.yml
@@ -107,7 +107,6 @@ conversion_to_onnx_args:
- '--conversion-param=dynamic_axes={"inputs": {0: "batch_size", 1: "sequence_len"},
"logits": {0: "batch_size", 1: "sequence_len"}}'
model_optimizer_args:
- --input_shape=[1,30480]
- --input=inputs
- --layout=inputs(NS)
- --input_model=$conv_dir/wav2vec2-base.onnx
