Add translation functionality using LLM #287

Open · wants to merge 3 commits into base: main

22 changes: 21 additions & 1 deletion Audio-Transcription-Chrome/README.md
@@ -29,17 +29,37 @@ When using the Audio Transcription extension, you have the following options:
- **Language**: Select the target language for transcription or translation. You can choose from a variety of languages supported by OpenAI-whisper.
- **Task**: Choose the specific task to perform on the audio. You can select either "transcribe" for transcription or "translate" to translate the audio to English.
- **Model Size**: Select the whisper model size to run the server with.
- **Target Language**: Select the target language for translation. This option allows you to translate the transcribed text into any language supported by the large language model (LLM).

### Getting Started
- Make sure the transcription server is running properly. To learn how to start the server, see the [documentation here](https://github.com/collabora/whisper-live).
- Click on the Chrome extension icon, which shows two options:
  - **Start Capture**: Starts capturing the audio in the current tab and sends it to the server for transcription. This also creates an element on the current tab to display the transcriptions received from the server.
  - **Stop Capture**: Stops capturing the audio.

### Using the LLM Translation Feature
The new LLM translation feature allows you to translate the transcribed text into any language supported by the large language model. To use this feature, follow these steps:
1. Open the options page of the Chrome extension.
2. Select the target language for translation from the dropdown menu.
3. Start capturing the audio in the current tab.
4. The transcribed text will be sent to the large language model for translation.
5. The translated text will be displayed on the current tab.

### Using the LLM Translation Feature via Command Line
The new LLM translation feature can also be used via the command line. To use this feature, follow these steps:
1. Make sure the transcription server is running properly. To learn how to start the server, see the [documentation here](https://github.com/collabora/whisper-live).
2. Use the following command to start the transcription server with the LLM translation feature:
```bash
python whisper_live/server.py --host <host> --port <port> --backend <backend> --model <model> --task <task> --language <language> --target_language <target_language>
```
3. The transcribed text will be sent to the large language model for translation.
4. The translated text will be displayed on the command line (a sketch of the underlying translation message exchange is shown below).
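
For reference, here is a minimal sketch of the translation message exchange handled by the server's `handle_translation` method in this PR: each request is a JSON object with `text` and `target_language` fields, the server replies with a JSON object containing `translated_text`, and the literal string `END_OF_TRANSLATION` ends the exchange. The host, port, example text, and target language below are placeholders, and the regular WhisperLive options handshake and audio streaming are omitted for brevity.
```python
import json

import websocket  # websocket-client, already a project dependency

# Assumption: a WhisperLive server with this PR's translation handler is running locally.
ws = websocket.create_connection("ws://localhost:9090")

# Ask the server to translate one transcribed segment.
ws.send(json.dumps({
    "text": "Hello, how are you?",
    "target_language": "fr",  # any language supported by the LLM
}))

# The server replies with the client uid and the translated text.
reply = json.loads(ws.recv())
print(reply["translated_text"])

# Signal that no more translation requests will be sent.
ws.send("END_OF_TRANSLATION")
ws.close()
```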

### Configuring the Large Model API
To configure the large model API, modify the `whisper_live/config.py` file. It defines a `Config` class that holds the configuration for the large model API, including the API URL. The URL defaults to the `LARGE_MODEL_API_URL` environment variable and can be set at runtime by calling the `set_large_model_api_url` method on a `Config` instance.
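
For example, a minimal sketch using the `Config` class added by this PR (the URL below is a placeholder):
```python
from whisper_live.config import Config

config = Config()  # falls back to the LARGE_MODEL_API_URL environment variable
config.set_large_model_api_url("https://llm.example.com/v1/translate")  # placeholder URL
print(config.get_large_model_api_url())
```
Calling `set_large_model_api_url` also updates the `LARGE_MODEL_API_URL` environment variable for the current process.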

## Limitations
This extension requires an internet connection to stream audio and receive transcriptions. The accuracy of the transcriptions may vary depending on the audio quality and the performance of the server-side transcription service. The extension may consume additional system resources while running, especially when streaming audio.

## Note
The extension relies on a properly running transcription server with multilingual support. Please follow the server documentation for setup and configuration.

24 changes: 24 additions & 0 deletions Audio-Transcription-Chrome/background.js
@@ -159,6 +159,7 @@ async function startCapture(options) {
task: options.task,
modelSize: options.modelSize,
useVad: options.useVad,
targetLanguage: options.targetLanguage, // Added target language for translation
},
});
} else {
@@ -188,6 +189,25 @@ async function stopCapture() {
}


/**
* Sends transcribed text to a large language model for translation and resolves with the translated text.
* @param {string} text - The transcribed text to be translated.
* @param {string} targetLanguage - The target language for translation.
* @returns {Promise<string>} - A Promise that resolves to the translated text.
*/
async function translateText(text, targetLanguage) {
  // Placeholder function for sending transcribed text to a large model for translation
  // Implement the actual translation logic here
  return new Promise((resolve) => {
    // Simulate translation delay
    setTimeout(() => {
      const translatedText = `Translated (${targetLanguage}): ${text}`;
      resolve(translatedText);
    }, 1000);
  });
}


/**
* Listens for messages from the runtime and performs corresponding actions.
* @param {Object} message - The message received from the runtime.
@@ -205,6 +225,10 @@ chrome.runtime.onMessage.addListener(async (message) => {
chrome.runtime.sendMessage({ action: "toggleCaptureButtons", data: false });
chrome.storage.local.set({ capturingState: { isCapturing: false } })
stopCapture();
} else if (message.action === "translateText") {
const { text, targetLanguage } = message;
const translatedText = await translateText(text, targetLanguage);
chrome.runtime.sendMessage({ action: "translatedText", translatedText });
}
});

3 changes: 2 additions & 1 deletion Audio-Transcription-Chrome/options.js
@@ -100,7 +100,8 @@ async function startRecord(option) {
language: option.language,
task: option.task,
model: option.modelSize,
use_vad: option.useVad
use_vad: option.useVad,
targetLanguage: option.targetLanguage, // Added target language for translation
})
);
};
207 changes: 207 additions & 0 deletions whisper_live.egg-info/PKG-INFO
@@ -0,0 +1,207 @@
Metadata-Version: 2.1
Name: whisper-live
Version: 0.5.1
Summary: A nearly-live implementation of OpenAI's Whisper.
Home-page: https://github.com/collabora/WhisperLive
Author: Collabora Ltd
Author-email: [email protected]
License: MIT
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: PyAudio
Requires-Dist: faster-whisper==1.0.1
Requires-Dist: torch
Requires-Dist: torchaudio
Requires-Dist: websockets
Requires-Dist: onnxruntime==1.16.0
Requires-Dist: ffmpeg-python
Requires-Dist: scipy
Requires-Dist: websocket-client
Requires-Dist: numba
Requires-Dist: openai-whisper
Requires-Dist: kaldialign
Requires-Dist: soundfile

# WhisperLive

<h2 align="center">
<a href="https://www.youtube.com/watch?v=0PHWCApIcCI"><img
src="https://img.youtube.com/vi/0PHWCApIcCI/0.jpg" style="background-color:rgba(0,0,0,0);" height=300 alt="WhisperLive"></a>
<br><br>A nearly-live implementation of OpenAI's Whisper.
<br><br>
</h2>

This project is a real-time transcription application that uses the OpenAI Whisper model
to convert speech input into text output. It can be used to transcribe both live audio
input from a microphone and pre-recorded audio files.

## Installation
- Install PyAudio and ffmpeg
```bash
bash scripts/setup.sh
```

- Install whisper-live from pip
```bash
pip install whisper-live
```

### Setting up NVIDIA/TensorRT-LLM for TensorRT backend
- Please follow [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md) for setup of [NVIDIA/TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) and for building Whisper-TensorRT engine.

## Getting Started
The server supports two backends, `faster_whisper` and `tensorrt`. If running the `tensorrt` backend, follow the [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md).

### Running the Server
- [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) backend
```bash
python3 run_server.py --port 9090 \
--backend faster_whisper

# running with custom model
python3 run_server.py --port 9090 \
--backend faster_whisper \
-fw "/path/to/custom/faster/whisper/model"
```

- TensorRT backend. Currently, we recommend using only the Docker setup for TensorRT. Follow the [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md), which works as expected. Make sure to build your TensorRT engines before running the server with the TensorRT backend.
```bash
# Run English only model
python3 run_server.py -p 9090 \
-b tensorrt \
-trt /home/TensorRT-LLM/examples/whisper/whisper_small_en

# Run Multilingual model
python3 run_server.py -p 9090 \
-b tensorrt \
-trt /home/TensorRT-LLM/examples/whisper/whisper_small \
-m
```
#### Controlling OpenMP Threads
To control the number of threads used by OpenMP, you can set the `OMP_NUM_THREADS` environment variable. This is useful for managing CPU resources and ensuring consistent performance. If not specified, `OMP_NUM_THREADS` is set to `1` by default. You can change this by using the `--omp_num_threads` argument:
```bash
python3 run_server.py --port 9090 \
--backend faster_whisper \
--omp_num_threads 4
```

#### Single model mode
By default, when running the server without specifying a model, the server instantiates a new Whisper model for every client connection. This has the advantage that the server can use different model sizes based on the client's requested model size. On the other hand, it also means you have to wait for the model to load upon client connection, and (V)RAM usage will be higher.

When serving a custom TensorRT model using the `-trt` or a custom faster_whisper model using the `-fw` option, the server will instead only instantiate the custom model once and then reuse it for all client connections.

If you don't want this, set `--no_single_model`.


### Running the Client
- Initialize the client with the parameters below:
- `lang`: Language of the input audio, applicable only if using a multilingual model.
- `translate`: If set to `True` then translate from any language to `en`.
- `model`: Whisper model size.
- `use_vad`: Whether to use `Voice Activity Detection` on the server.
- `save_output_recording`: Set to True to save the microphone input as a `.wav` file during live transcription. This option is helpful for recording sessions for later playback or analysis. Defaults to `False`.
- `output_recording_filename`: Specifies the `.wav` file path where the microphone input will be saved if `save_output_recording` is set to `True`.
```python
from whisper_live.client import TranscriptionClient
client = TranscriptionClient(
"localhost",
9090,
lang="en",
translate=False,
model="small",
use_vad=False,
save_output_recording=True, # Only used for microphone input, False by Default
output_recording_filename="./output_recording.wav" # Only used for microphone input
)
```
This connects to the server running on localhost at port 9090. With a multilingual model, the transcription language is detected automatically; you can also use the `lang` option to specify it explicitly, in this case English ("en"). Set the `translate` option to `True` to translate from the source language to English, or to `False` to transcribe in the source language.

- Transcribe an audio file:
```python
client("tests/jfk.wav")
```

- To transcribe from microphone:
```python
client()
```

- To transcribe from an RTSP stream:
```python
client(rtsp_url="rtsp://admin:[email protected]/rtsp")
```

- To transcribe from an HLS stream:
```python
client(hls_url="http://as-hls-ww-live.akamaized.net/pool_904/live/ww/bbc_1xtra/bbc_1xtra.isml/bbc_1xtra-audio%3d96000.norewind.m3u8")
```

## Browser Extensions
- Run the server with your desired backend as shown [here](https://github.com/collabora/WhisperLive?tab=readme-ov-file#running-the-server).
- Transcribe audio directly from your browser using our Chrome or Firefox extensions. Refer to [Audio-Transcription-Chrome](https://github.com/collabora/whisper-live/tree/main/Audio-Transcription-Chrome#readme) and [Audio-Transcription-Firefox](https://github.com/collabora/whisper-live/tree/main/Audio-Transcription-Firefox#readme) for setup instructions.

## Whisper Live Server in Docker
- GPU
- Faster-Whisper
```bash
docker run -it --gpus all -p 9090:9090 ghcr.io/collabora/whisperlive-gpu:latest
```

- TensorRT.
```bash
docker run -p 9090:9090 --runtime=nvidia --gpus all --entrypoint /bin/bash -it ghcr.io/collabora/whisperlive-tensorrt

# Build small.en engine
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en

# Run server with small.en
python3 run_server.py --port 9090 \
--backend tensorrt \
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en"
```

- CPU
```bash
docker run -it -p 9090:9090 ghcr.io/collabora/whisperlive-cpu:latest
```
**Note**: By default, we use the "small" model size. To build a Docker image for a different model size, change the size in server.py and then build the image.

## Future Work
- [ ] Add translation to other languages on top of transcription.
- [x] TensorRT backend for Whisper.

## Contact

We are available to help you with both Open Source and proprietary AI projects. You can reach us via the Collabora website or [[email protected]](mailto:[email protected]) and [[email protected]](mailto:[email protected]).

## Citations
```bibtex
@article{Whisper,
title = {Robust Speech Recognition via Large-Scale Weak Supervision},
url = {https://arxiv.org/abs/2212.04356},
author = {Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},
publisher = {arXiv},
year = {2022},
}
```

```bibtex
@misc{SileroVAD,
author = {Silero Team},
title = {Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier},
year = {2021},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/snakers4/silero-vad}},
email = {[email protected]}
}
```
22 changes: 22 additions & 0 deletions whisper_live.egg-info/SOURCES.txt
@@ -0,0 +1,22 @@
LICENSE
README.md
setup.py
tests/__init__.py
tests/test_client.py
tests/test_server.py
tests/test_vad.py
whisper_live/__init__.py
whisper_live/__version__.py
whisper_live/client.py
whisper_live/config.py
whisper_live/server.py
whisper_live/tensorrt_utils.py
whisper_live/transcriber.py
whisper_live/transcriber_tensorrt.py
whisper_live/utils.py
whisper_live/vad.py
whisper_live.egg-info/PKG-INFO
whisper_live.egg-info/SOURCES.txt
whisper_live.egg-info/dependency_links.txt
whisper_live.egg-info/requires.txt
whisper_live.egg-info/top_level.txt
1 change: 1 addition & 0 deletions whisper_live.egg-info/dependency_links.txt
@@ -0,0 +1 @@

13 changes: 13 additions & 0 deletions whisper_live.egg-info/requires.txt
@@ -0,0 +1,13 @@
PyAudio
faster-whisper==1.0.1
torch
torchaudio
websockets
onnxruntime==1.16.0
ffmpeg-python
scipy
websocket-client
numba
openai-whisper
kaldialign
soundfile
2 changes: 2 additions & 0 deletions whisper_live.egg-info/top_level.txt
@@ -0,0 +1,2 @@
tests
whisper_live
12 changes: 12 additions & 0 deletions whisper_live/config.py
@@ -0,0 +1,12 @@
import os

class Config:
def __init__(self):
self.large_model_api_url = os.getenv("LARGE_MODEL_API_URL", "http://default-api-url.com")

def get_large_model_api_url(self):
return self.large_model_api_url

def set_large_model_api_url(self, url):
self.large_model_api_url = url
os.environ["LARGE_MODEL_API_URL"] = url
25 changes: 25 additions & 0 deletions whisper_live/server.py
@@ -302,6 +302,7 @@ def recv_audio(self,
while not self.client_manager.is_client_timeout(websocket):
if not self.process_audio_frames(websocket):
break
self.handle_translation(websocket) # Handle translation requests
except ConnectionClosed:
logging.info("Connection closed by client")
except Exception as e:
@@ -312,6 +313,30 @@
websocket.close()
del websocket

def handle_translation(self, websocket):
    """
    Handle translation requests from the client.

    Args:
        websocket (WebSocket): The WebSocket connection for the client.
    """
    client = self.client_manager.get_client(websocket)
    if client:
        while True:
            message = websocket.recv()
            if message == "END_OF_TRANSLATION":
                break
            data = json.loads(message)
            text = data.get("text")
            target_language = data.get("target_language")
            if text and target_language:
                translated_text = client.transcriber.translate(text, target_language)
                response = {
                    "uid": client.client_uid,
                    "translated_text": translated_text
                }
                websocket.send(json.dumps(response))

def run(self,
host,
port=9090,