Commit 6dd8b14
Whisper pipeline: update readme (openvinotoolkit#1045)
Co-authored-by: Ilya Lavrenov <[email protected]>
as-suvorov and ilya-lavrenov authored Oct 24, 2024
1 parent 6a4ba7f commit 6dd8b14
Showing 11 changed files with 163 additions and 138 deletions.
1 change: 1 addition & 0 deletions .gitignore

````diff
@@ -35,3 +35,4 @@ CMakeUserPresets.json
 *.pyc
 __pycache__
 .py-build-cmake_cache
+*.egg-info
````
51 changes: 9 additions & 42 deletions README.md

````diff
@@ -230,39 +230,18 @@
 NOTE: This sample is a simplified version of the full sample that is available [here](./samples/python/whisper_speech_recognition/whisper_speech_recognition.py)
 
 ```python
-import argparse
 import openvino_genai
 import librosa
 
 
 def read_wav(filepath):
     raw_speech, samplerate = librosa.load(filepath, sr=16000)
     return raw_speech.tolist()
 
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("model_dir")
-    parser.add_argument("wav_file_path")
-    args = parser.parse_args()
-
-    device = 'CPU' # GPU can be used as well
-    pipe = openvino_genai.WhisperPipeline(args.model_dir, device)
-
-    raw_speech = read_wav(args.wav_file_path)
-
-    def streamer(word: str) -> bool:
-        print(word, end="")
-        return False
-
-    pipe.generate(
-        raw_speech,
-        max_new_tokens=100,
-        # 'task' and 'language' parameters are supported for multilingual models only
-        language="<|en|>",
-        task="transcribe",
-        streamer=streamer,
-    )
-
-    print()
+device = "CPU" # GPU can be used as well
+pipe = openvino_genai.WhisperPipeline("whisper-base", device)
+raw_speech = read_wav("sample.wav")
+print(pipe.generate(raw_speech))
 ```
````
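
The keyword arguments dropped from the simplified snippet remain available on `generate`. A minimal sketch assembled from the removed lines above; `"whisper-base"` and `"sample.wav"` are placeholders for your exported model directory and audio file:

```python
import librosa
import openvino_genai


def read_wav(filepath):
    # Whisper pipelines expect 16 kHz mono samples
    raw_speech, samplerate = librosa.load(filepath, sr=16000)
    return raw_speech.tolist()


def streamer(word: str) -> bool:
    print(word, end="")  # print tokens as they arrive
    return False  # False keeps generation going


pipe = openvino_genai.WhisperPipeline("whisper-base", "CPU")  # GPU can be used as well
pipe.generate(
    read_wav("sample.wav"),
    max_new_tokens=100,
    # 'task' and 'language' parameters are supported for multilingual models only
    language="<|en|>",
    task="transcribe",
    streamer=streamer,
)
```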


````diff
@@ -271,11 +250,12 @@
 NOTE: This sample is a simplified version of the full sample that is available [here](./samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp)
 
 ```cpp
+#include <iostream>
+
 #include "audio_utils.hpp"
 #include "openvino/genai/whisper_pipeline.hpp"
 
-int main(int argc, char* argv[]) try {
+int main(int argc, char* argv[]) {
     std::filesystem::path models_path = argv[1];
     std::string wav_file_path = argv[2];
     std::string device = "CPU"; // GPU can be used as well
@@ -284,20 +264,7 @@
 
     ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path);
 
-    ov::genai::WhisperGenerationConfig config(models_path / "generation_config.json");
-    config.max_new_tokens = 100;
-    // 'task' and 'language' parameters are supported for multilingual models only
-    config.language = "<|en|>";
-    config.task = "transcribe";
-
-    auto streamer = [](std::string word) {
-        std::cout << word;
-        return false;
-    };
-
-    pipeline.generate(raw_speech, config, streamer);
-
-    std::cout << std::endl;
+    std::cout << pipeline.generate(raw_speech, ov::genai::max_new_tokens(100)) << '\n';
 }
 ```
````
23 changes: 9 additions & 14 deletions samples/cpp/whisper_speech_recognition/README.md
````diff
@@ -17,24 +17,19 @@
 
 Prepare audio file in wav format with sampling rate 16k Hz.
 
-## Run
+You can download example audio file: https://storage.openvinotoolkit.org/models_contrib/speech/2021.2/librispeech_s5/how_are_you_doing_today.wav
 
-`whisper_speech_recognition whisper-base sample.wav`
+## Run
 
-Output: text transcription of `sample.wav`
+`whisper_speech_recognition whisper-base how_are_you_doing_today.wav`
 
-Models can be downloaded from [OpenAI HuggingFace](https://huggingface.co/openai).
+Output:
+```
+How are you doing today?
+timestamps: [0, 2] text: How are you doing today?
+```
 
-Supported Models:
-[openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny)
-[openai/whisper-tiny.en](https://huggingface.co/openai/whisper-tiny.en)
-[openai/whisper-base](https://huggingface.co/openai/whisper-base)
-[openai/whisper-base.en](https://huggingface.co/openai/whisper-base.en)
-[openai/whisper-small](https://huggingface.co/openai/whisper-small)
-[openai/whisper-small.en](https://huggingface.co/openai/whisper-small.en)
-[openai/whisper-medium](https://huggingface.co/openai/whisper-medium)
-[openai/whisper-medium.en](https://huggingface.co/openai/whisper-medium.en)
-[openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3)
+See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
 
 ### Troubleshooting
````
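
The `timestamps: [0, 2] text: ...` line in the expected output comes from chunk-level results. A sketch of reading the same fields from Python, assuming the result's `chunks` entries mirror the C++ `WhisperDecodedResultChunk` (`start_ts`, `end_ts`, `text`) shown later in this commit:

```python
import librosa
import openvino_genai

model_dir = "whisper-base"  # placeholder for your exported model directory
raw_speech, _ = librosa.load("how_are_you_doing_today.wav", sr=16000)

pipe = openvino_genai.WhisperPipeline(model_dir, "CPU")
config = openvino_genai.WhisperGenerationConfig(model_dir + "/generation_config.json")
config.return_timestamps = True  # populate result.chunks

result = pipe.generate(raw_speech.tolist(), config)
print(result)
for chunk in result.chunks:
    print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}")
```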
samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp
````diff
@@ -11,16 +11,14 @@ int main(int argc, char* argv[]) try {
 
     std::filesystem::path models_path = argv[1];
     std::string wav_file_path = argv[2];
-    std::string device = "CPU"; // GPU can be used as well
+    std::string device = "CPU";  // GPU can be used as well
 
     ov::genai::WhisperPipeline pipeline(models_path, device);
 
-    ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path);
-
     ov::genai::WhisperGenerationConfig config(models_path / "generation_config.json");
     config.max_new_tokens = 100;
     // 'task' and 'language' parameters are supported for multilingual models only
-    config.language = "<|en|>";
+    config.language = "<|en|>";  // can switch to <|zh|> for Chinese language
     config.task = "transcribe";
     config.return_timestamps = true;
@@ -29,6 +27,7 @@
         return false;
     };
 
+    ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path);
     auto result = pipeline.generate(raw_speech, config, streamer);
 
     std::cout << "\n";
````
23 changes: 12 additions & 11 deletions samples/python/whisper_speech_recognition/README.md
````diff
@@ -15,27 +15,28 @@
 
 ## Prepare audio file
 
-You can prepare an audio file in WAV format with a sampling rate of 16k Hz using the [`recorder.py`](recorder.py) script. The script records 5 seconds of audio from the microphone.
+Download example audio file: https://storage.openvinotoolkit.org/models_contrib/speech/2021.2/librispeech_s5/how_are_you_doing_today.wav
+
+Or you can use the [`recorder.py`](recorder.py) script. The script records 5 seconds of audio from the microphone.
 
 To install `PyAudio` dependency follow the [installation instructions](https://pypi.org/project/PyAudio/).
 
+To install dependencies:
+```
+pip install pyaudio
+```
+To run the script:
+```
+python recorder.py
+```
 
 ## Run the Whisper model
 
-`whisper_speech_recognition whisper-base sample.wav`
+`whisper_speech_recognition whisper-base how_are_you_doing_today.wav`
 
-Output: text transcription of `sample.wav`
-
-Models can be downloaded from [OpenAI HuggingFace](https://huggingface.co/openai).
+Output:
+```
+How are you doing today?
+timestamps: [0, 2] text: How are you doing today?
+```
 
-Supported Models:
-[openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) | [openai/whisper-tiny.en](https://huggingface.co/openai/whisper-tiny.en) | [openai/whisper-base](https://huggingface.co/openai/whisper-base) | [openai/whisper-base.en](https://huggingface.co/openai/whisper-base.en) | [openai/whisper-small](https://huggingface.co/openai/whisper-small) | [openai/whisper-small.en](https://huggingface.co/openai/whisper-small.en) | [openai/whisper-medium](https://huggingface.co/openai/whisper-medium) | [openai/whisper-medium.en](https://huggingface.co/openai/whisper-medium.en) | [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3)
+See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
 
 ### Troubleshooting
````
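
The linked `recorder.py` is the authoritative script; purely as an illustration, a hypothetical 5-second, 16 kHz mono recorder built on `PyAudio` could look like this:

```python
import wave

import pyaudio

RATE = 16000      # Whisper expects 16 kHz audio
CHUNK = 1024      # frames per buffer
SECONDS = 5       # recording length
OUTPUT = "output.wav"

audio = pyaudio.PyAudio()
stream = audio.open(format=pyaudio.paInt16, channels=1, rate=RATE,
                    input=True, frames_per_buffer=CHUNK)

print("Recording...")
frames = [stream.read(CHUNK) for _ in range(RATE * SECONDS // CHUNK)]
print("Done.")

stream.stop_stream()
stream.close()
audio.terminate()

with wave.open(OUTPUT, "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)  # paInt16 samples are 2 bytes wide
    wf.setframerate(RATE)
    wf.writeframes(b"".join(frames))
```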
samples/python/whisper_speech_recognition/whisper_speech_recognition.py
````diff
@@ -18,18 +18,18 @@ def main():
     parser.add_argument("wav_file_path")
     args = parser.parse_args()
 
+    device = "CPU"  # GPU can be used as well
+    pipe = openvino_genai.WhisperPipeline(args.model_dir, device)
+
     config = openvino_genai.WhisperGenerationConfig(
         args.model_dir + "/generation_config.json"
     )
-    config.max_new_tokens = 100 # increase this based on your speech length
+    config.max_new_tokens = 100  # increase this based on your speech length
     # 'task' and 'language' parameters are supported for multilingual models only
-    config.language = "<|en|>" # can switch to <|zh|> for Chinese language
+    config.language = "<|en|>"  # can switch to <|zh|> for Chinese language
     config.task = "transcribe"
     config.return_timestamps = True
 
-    device = 'CPU' # GPU can be used as well
-    pipe = openvino_genai.WhisperPipeline(args.model_dir, device)
-
     def streamer(word: str) -> bool:
         print(word, end="")
         return False
````
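
With this hunk applied, pipeline creation moves ahead of the config block. The resulting `main()`, reconstructed from the hunks in this commit (consult the linked sample for the authoritative version; the placement of the `read_wav` call is an assumption), reads roughly:

```python
import argparse

import librosa
import openvino_genai


def read_wav(filepath):
    raw_speech, samplerate = librosa.load(filepath, sr=16000)
    return raw_speech.tolist()


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("model_dir")
    parser.add_argument("wav_file_path")
    args = parser.parse_args()

    device = "CPU"  # GPU can be used as well
    pipe = openvino_genai.WhisperPipeline(args.model_dir, device)

    config = openvino_genai.WhisperGenerationConfig(
        args.model_dir + "/generation_config.json"
    )
    config.max_new_tokens = 100  # increase this based on your speech length
    # 'task' and 'language' parameters are supported for multilingual models only
    config.language = "<|en|>"  # can switch to <|zh|> for Chinese language
    config.task = "transcribe"
    config.return_timestamps = True

    def streamer(word: str) -> bool:
        print(word, end="")
        return False

    raw_speech = read_wav(args.wav_file_path)
    pipe.generate(raw_speech, config, streamer)
    print()  # finish the streamed line


if __name__ == "__main__":
    main()
```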
src/cpp/include/openvino/genai/whisper_generation_config.hpp
````diff
@@ -102,7 +102,6 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig {
 /*
  * utils that allow to use generate and operator() in the following way:
  * pipe.generate(input_ids, ov::genai::max_new_tokens(200),...)
- * pipe(text, ov::genai::max_new_tokens(200),...)
  */
 
 static constexpr ov::Property<std::vector<int64_t>> begin_suppress_tokens{"begin_suppress_tokens"};
````
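
In the Python binding the same shortcut surfaces as keyword arguments on `generate`; the README sample removed in this commit used exactly this form, with raw speech in place of `input_ids`. A brief sketch with placeholder paths:

```python
import librosa
import openvino_genai

raw_speech, _ = librosa.load("how_are_you_doing_today.wav", sr=16000)
pipe = openvino_genai.WhisperPipeline("whisper-base", "CPU")

# Keyword arguments map onto WhisperGenerationConfig fields, mirroring
# C++ property helpers such as ov::genai::max_new_tokens(200).
print(pipe.generate(raw_speech.tolist(), max_new_tokens=200))
```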
12 changes: 7 additions & 5 deletions src/cpp/include/openvino/genai/whisper_pipeline.hpp
````diff
@@ -33,6 +33,9 @@ struct WhisperDecodedResults : public DecodedResults {
     std::optional<std::vector<WhisperDecodedResultChunk>> chunks = std::nullopt;
 };
 
+/**
+ * @brief Automatic speech recognition pipeline
+ */
 class OPENVINO_GENAI_EXPORTS WhisperPipeline {
     class Impl;
     std::unique_ptr<Impl> m_impl;
@@ -58,11 +61,10 @@ class OPENVINO_GENAI_EXPORTS WhisperPipeline {
      * @param device optional device
      * @param properties optional properties
      */
-    template <typename... Properties, typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
-    WhisperPipeline(const std::string& models_path,
-                    const std::string& device,
-                    Properties&&... properties)
-        : WhisperPipeline(models_path, device, ov::AnyMap{std::forward<Properties>(properties)...}) { }
+    template <typename... Properties,
+              typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
+    WhisperPipeline(const std::filesystem::path& models_path, const std::string& device, Properties&&... properties)
+        : WhisperPipeline(models_path, device, ov::AnyMap{std::forward<Properties>(properties)...}) {}
 
     ~WhisperPipeline();
````
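
If the Python binding mirrors this variadic-properties constructor, extra entries for the underlying `ov::AnyMap` can be supplied as keyword arguments. A sketch under that assumption, using OpenVINO's `CACHE_DIR` property as the example:

```python
import openvino_genai

# Assumption: keyword arguments beyond device are forwarded to the ov::AnyMap,
# as the C++ templated constructor does with its Properties pack.
pipe = openvino_genai.WhisperPipeline("whisper-base", "GPU", CACHE_DIR="model_cache")
```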
52 changes: 45 additions & 7 deletions src/docs/SUPPORTED_MODELS.md
````diff
@@ -206,8 +206,8 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
       <th>Example HuggingFace Models</th>
     </tr>
     <tr>
-      <td>InternVL2</td>
       <td><code>InternVL2</code></td>
+      <td>InternVL2</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/OpenGVLab/InternVL2-1B"><code>OpenGVLab/InternVL2-1B</code></a></li>
@@ -218,17 +218,17 @@
       </td>
     </tr>
     <tr>
-      <td>LLaVA</td>
-      <td><code>LLaVA-v1.5</code></td>
+      <td><code>LLaVA</code></td>
+      <td>LLaVA-v1.5</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/llava-hf/llava-1.5-7b-hf"><code>llava-hf/llava-1.5-7b-hf</code></a></li>
         </ul>
       </td>
     </tr>
     <tr>
-      <td>LLaVA-NeXT</td>
-      <td><code>LLaVa-v1.6</code></td>
+      <td><code>LLaVA-NeXT</code></td>
+      <td>LLaVa-v1.6</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf"><code>llava-hf/llava-v1.6-mistral-7b-hf</code></a></li>
@@ -238,8 +238,8 @@
       </td>
     </tr>
     <tr>
-      <td>MiniCPMV</td>
-      <td><code>MiniCPM-V-2_6</code></td>
+      <td><code>MiniCPMV</code></td>
+      <td>MiniCPM-V-2_6</td>
       <td>
         <ul>
           <li><a href="https://huggingface.co/openbmb/MiniCPM-V-2_6"><code>openbmb/MiniCPM-V-2_6</code></a></li>
@@ -249,6 +249,44 @@
     </tbody>
 </table>
 
+## Whisper models
+
+<table>
+  <tbody style="vertical-align: top;">
+    <tr>
+      <th>Architecture</th>
+      <th>Models</th>
+      <th>Example HuggingFace Models</th>
+    </tr>
+    <tr>
+      <td rowspan=2><code>WhisperForConditionalGeneration</code></td>
+      <td>Whisper</td>
+      <td>
+        <ul>
+          <li><a href="https://huggingface.co/openai/whisper-tiny"><code>openai/whisper-tiny</code></a></li>
+          <li><a href="https://huggingface.co/openai/whisper-tiny.en"><code>openai/whisper-tiny.en</code></a></li>
+          <li><a href="https://huggingface.co/openai/whisper-base"><code>openai/whisper-base</code></a></li>
+          <li><a href="https://huggingface.co/openai/whisper-base.en"><code>openai/whisper-base.en</code></a></li>
+          <li><a href="https://huggingface.co/openai/whisper-small"><code>openai/whisper-small</code></a></li>
+          <li><a href="https://huggingface.co/openai/whisper-small.en"><code>openai/whisper-small.en</code></a></li>
+          <li><a href="https://huggingface.co/openai/whisper-medium"><code>openai/whisper-medium</code></a></li>
+          <li><a href="https://huggingface.co/openai/whisper-medium.en"><code>openai/whisper-medium.en</code></a></li>
+          <li><a href="https://huggingface.co/openai/whisper-large-v3"><code>openai/whisper-large-v3</code></a></li>
+        </ul>
+      </td>
+    </tr>
+    <tr>
+      <td>Distil-Whisper</td>
+      <td>
+        <ul>
+          <li><a href="https://huggingface.co/distil-whisper/distil-small.en"><code>distil-whisper/distil-small.en</code></a></li>
+          <li><a href="https://huggingface.co/distil-whisper/distil-medium.en"><code>distil-whisper/distil-medium.en</code></a></li>
+          <li><a href="https://huggingface.co/distil-whisper/distil-large-v3"><code>distil-whisper/distil-large-v3</code></a></li>
+        </ul>
+      </td>
+    </tr>
+  </tbody>
+</table>
+
 Some models may require access request submission on the Hugging Face page to be downloaded.
 
 If https://huggingface.co/ is down, the conversion step won't be able to download the models.
````
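
The export command shown for `openai/whisper-base` earlier in this commit should carry over to the other checkpoints listed above; for example, for a Distil-Whisper model (untested, same flags with only the model id swapped):

```
optimum-cli export openvino --trust-remote-code --model distil-whisper/distil-small.en distil-small.en
```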