Model size option #76

Merged · 10 commits · Dec 20, 2023
1 change: 1 addition & 0 deletions Audio-Transcription-Chrome/README.md
@@ -29,6 +29,7 @@ When using the Audio Transcription extension, you have the following options:
 - **Use Multilingual Model**: Enable this option to utilize the multilingual capabilities of OpenAI-whisper.
 - **Language**: Select the target language for transcription or translation. You can choose from a variety of languages supported by OpenAI-whisper.
 - **Task:** Choose the specific task to perform on the audio. You can select either "transcribe" for transcription or "translate" to translate the audio to English.
+- **Model Size**: Select the Whisper model size for the server to run.
 
 ### Getting Started
 - Make sure the transcription server is running properly. To know more about how to start the server, see the [documentation here](https://github.com/collabora/whisper-live).
3 changes: 2 additions & 1 deletion Audio-Transcription-Chrome/background.js
@@ -156,7 +156,8 @@ async function startCapture(options) {
         port: options.port,
         multilingual: options.useMultilingual,
         language: options.language,
-        task: options.task
+        task: options.task,
+        modelSize: options.modelSize
       },
     });
   } else {
3 changes: 2 additions & 1 deletion Audio-Transcription-Chrome/options.js
@@ -102,7 +102,8 @@ async function startRecord(option) {
       uid: uuid,
       multilingual: option.multilingual,
       language: option.language,
-      task: option.task
+      task: option.task,
+      model_size: option.modelSize
     })
   );
 };
13 changes: 12 additions & 1 deletion Audio-Transcription-Chrome/popup.html
@@ -125,11 +125,22 @@
   </div>
   <div class="dropdown-container">
     <label for="taskDropdown">Select task:</label>
-    <select id="taskDropdown" disabled>
+    <select id="taskDropdown">
       <option value="">Select Task</option>
       <option value="transcribe" selected>Transcribe</option>
       <option value="translate">Translate</option>
     </select>
   </div>
+  <div class="dropdown-container">
+    <label for="modelSizeDropdown">Select Model Size:</label>
+    <select id="modelSizeDropdown">
+      <option value="">Select Model Size</option>
+      <option value="tiny">Tiny</option>
+      <option value="base">Base</option>
+      <option value="small" selected>Small</option>
+      <option value="medium">Medium</option>
+      <option value="large-v2">Large-v2</option>
+    </select>
+  </div>
 </body>
</html>
20 changes: 18 additions & 2 deletions Audio-Transcription-Chrome/popup.js
@@ -7,8 +7,10 @@ document.addEventListener("DOMContentLoaded", function () {
   const useMultilingualCheckbox = document.getElementById('useMultilingualCheckbox');
   const languageDropdown = document.getElementById('languageDropdown');
   const taskDropdown = document.getElementById('taskDropdown');
+  const modelSizeDropdown = document.getElementById('modelSizeDropdown');
   let selectedLanguage = null;
   let selectedTask = taskDropdown.value;
+  let selectedModelSize = modelSizeDropdown.value;
 
   // Add click event listeners to the buttons
   startButton.addEventListener("click", startCapture);
@@ -52,6 +54,13 @@ document.addEventListener("DOMContentLoaded", function () {
     }
   });
 
+  chrome.storage.local.get("selectedModelSize", ({ selectedModelSize: storedModelSize }) => {
+    if (storedModelSize !== undefined) {
+      modelSizeDropdown.value = storedModelSize;
+      selectedModelSize = storedModelSize;
+    }
+  });
+
   // Function to handle the start capture button click event
   async function startCapture() {
     // Ignore click if the button is disabled
@@ -64,7 +73,7 @@
 
     // Send a message to the background script to start capturing
     let host = "localhost";
-    let port = "9090";
+    let port = "5901";
    const useCollaboraServer = useServerCheckbox.checked;
    if (useCollaboraServer){
      host = "transcription.kurg.org"
@@ -79,7 +88,8 @@
         port: port,
         useMultilingual: useMultilingualCheckbox.checked,
         language: selectedLanguage,
-        task: selectedTask
+        task: selectedTask,
+        modelSize: selectedModelSize
       }, () => {
         // Update capturing state in storage and toggle the buttons
         chrome.storage.local.set({ capturingState: { isCapturing: true } }, () => {
@@ -120,6 +130,7 @@
   stopButton.disabled = !isCapturing;
   useServerCheckbox.disabled = isCapturing;
   useMultilingualCheckbox.disabled = isCapturing;
+  modelSizeDropdown.disabled = isCapturing;
 
   startButton.classList.toggle("disabled", isCapturing);
   stopButton.classList.toggle("disabled", !isCapturing);
@@ -157,6 +168,11 @@
     chrome.storage.local.set({ selectedTask });
   });
 
+  modelSizeDropdown.addEventListener('change', function() {
+    selectedModelSize = modelSizeDropdown.value;
+    chrome.storage.local.set({ selectedModelSize });
+  });
+
   chrome.runtime.onMessage.addListener(async (request, sender, sendResponse) => {
     if (request.action === "updateSelectedLanguage") {
       const detectedLanguage = request.detectedLanguage;
1 change: 1 addition & 0 deletions Audio-Transcription-Firefox/README.md
@@ -27,6 +27,7 @@ When using the Audio Transcription extension, you have the following options:
 - **Use Multilingual Model**: Enable this option to utilize the multilingual capabilities of OpenAI-whisper.
 - **Language**: Select the target language for transcription or translation. You can choose from a variety of languages supported by OpenAI-whisper.
 - **Task:** Choose the specific task to perform on the audio. You can select either "transcribe" for transcription or "translate" to translate the audio to English.
+- **Model Size**: Select the Whisper model size for the server to run.
 
 ### Getting Started
 - Make sure the transcription server is running properly. To know more about how to start the server, see the [documentation here](https://github.com/collabora/whisper-live).
3 changes: 2 additions & 1 deletion Audio-Transcription-Firefox/content.js
@@ -77,7 +77,8 @@ function startRecording(data) {
       uid: uuid,
       multilingual: data.useMultilingual,
       language: data.language,
-      task: data.task
+      task: data.task,
+      model_size: data.modelSize
     })
   );
 };
11 changes: 11 additions & 0 deletions Audio-Transcription-Firefox/popup.html
@@ -133,5 +133,16 @@
       <option value="translate">Translate</option>
     </select>
   </div>
+  <div class="dropdown-container">
+    <label for="modelSizeDropdown">Select Model Size:</label>
+    <select id="modelSizeDropdown">
+      <option value="">Select Model Size</option>
+      <option value="tiny">Tiny</option>
+      <option value="base">Base</option>
+      <option value="small" selected>Small</option>
+      <option value="medium">Medium</option>
+      <option value="large-v2">Large-v2</option>
+    </select>
+  </div>
 </body>
</html>
21 changes: 19 additions & 2 deletions Audio-Transcription-Firefox/popup.js
@@ -6,8 +6,11 @@ document.addEventListener("DOMContentLoaded", function() {
   const useMultilingualCheckbox = document.getElementById('useMultilingualCheckbox');
   const languageDropdown = document.getElementById('languageDropdown');
   const taskDropdown = document.getElementById('taskDropdown');
+  const modelSizeDropdown = document.getElementById('modelSizeDropdown');
   let selectedLanguage = null;
   let selectedTask = taskDropdown.value;
+  let selectedModelSize = modelSizeDropdown.value;
+
 
   browser.storage.local.get("capturingState")
     .then(function(result) {
@@ -54,9 +57,16 @@ document.addEventListener("DOMContentLoaded", function() {
     }
   });
 
+  browser.storage.local.get("selectedModelSize", ({ selectedModelSize: storedModelSize }) => {
+    if (storedModelSize !== undefined) {
+      modelSizeDropdown.value = storedModelSize;
+      selectedModelSize = storedModelSize;
+    }
+  });
+
   startButton.addEventListener("click", function() {
     let host = "localhost";
-    let port = "9090";
+    let port = "5901";
     const useCollaboraServer = useServerCheckbox.checked;
 
     if (useCollaboraServer){
@@ -75,7 +85,8 @@
         port: port,
         useMultilingual: useMultilingualCheckbox.checked,
         language: selectedLanguage,
-        task: selectedTask
+        task: selectedTask,
+        modelSize: selectedModelSize
       }
     });
     toggleCaptureButtons(true);
@@ -115,6 +126,7 @@
   stopButton.disabled = !isCapturing;
   useServerCheckbox.disabled = isCapturing;
   useMultilingualCheckbox.disabled = isCapturing;
+  modelSizeDropdown.disabled = isCapturing;
 
   startButton.classList.toggle("disabled", isCapturing);
   stopButton.classList.toggle("disabled", !isCapturing);
@@ -152,6 +164,11 @@
     browser.storage.local.set({ selectedTask });
   });
 
+  modelSizeDropdown.addEventListener('change', function() {
+    selectedModelSize = modelSizeDropdown.value;
+    browser.storage.local.set({ selectedModelSize });
+  });
+
   browser.runtime.onMessage.addListener((request, sender, sendResponse) => {
     if (request.action === "updateSelectedLanguage") {
       const detectedLanguage = request.data;
25 changes: 20 additions & 5 deletions README.md
@@ -28,18 +28,33 @@ Unlike traditional speech recognition systems that rely on continuous audio stre
 - To transcribe an audio file:
 ```python
 from whisper_live.client import TranscriptionClient
-client = TranscriptionClient("localhost", 9090, is_multilingual=True, lang="hi", translate=True)
-client(audio_file_path)
+client = TranscriptionClient(
+  "localhost",
+  9090,
+  is_multilingual=False,
+  lang="en",
+  translate=False,
+  model_size="small"
+)
+
+client("tests/jfk.wav")
 ```
-This command transcribes the specified audio file (audio.wav) using the Whisper model. It connects to the server running on localhost at port 9090. It also enables the multilingual feature, allowing transcription in multiple languages. The language option specifies the target language for transcription, in this case, Hindi ("hi"). The translate option should be set to `True` if we want to translate from the source language to English and `False` if we want to transcribe in the source language.
+This command transcribes the specified audio file (tests/jfk.wav) using the Whisper model. It connects to the server running on localhost at port 9090. Setting `is_multilingual` to `True` would enable the multilingual feature, allowing transcription in languages other than English. The language option specifies the target language for transcription, in this case, English ("en"). The translate option should be set to `True` if we want to translate from the source language to English and `False` if we want to transcribe in the source language.
 
 - To transcribe from microphone:
 ```python
 from whisper_live.client import TranscriptionClient
-client = TranscriptionClient(host, port, is_multilingual=True, lang="hi", translate=True)
+client = TranscriptionClient(
+  "localhost",
+  9090,
+  is_multilingual=True,
+  lang="hi",
+  translate=True,
+  model_size="small"
+)
 client()
 ```
-This command captures audio from the microphone and sends it to the server for transcription. It uses the same options as the previous command, enabling the multilingual feature and specifying the target language and task.
+This command captures audio from the microphone and sends it to the server for transcription. It enables the multilingual feature with Hindi ("hi") as the selected language, specifying the target language and task. Whisper `small` is used by default, but the model size can be changed to any other option depending on the requirements and the hardware running the server.
 
 - To transcribe from an HLS stream:
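The HLS code sample itself is truncated in this diff view. As a minimal sketch of what such a call could look like, assuming the `hls_url` keyword that `TranscriptionClient.__call__` accepts in the client.py changes below (the stream URL is a placeholder):

```python
from whisper_live.client import TranscriptionClient

# Same constructor as above; model_size picks the Whisper checkpoint
# the server should load.
client = TranscriptionClient(
    "localhost",
    9090,
    is_multilingual=False,
    lang="en",
    translate=False,
    model_size="small"
)

# Placeholder URL -- substitute a real HLS playlist.
client(hls_url="http://example.com/live/stream.m3u8")
```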
Binary file added tests/jfk.flac
30 changes: 19 additions & 11 deletions whisper_live/client.py
@@ -50,7 +50,7 @@ class Client:
     INSTANCES = {}
 
     def __init__(
-        self, host=None, port=None, is_multilingual=False, lang=None, translate=False
+        self, host=None, port=None, is_multilingual=False, lang=None, translate=False, model_size="small"
     ):
         """
         Initializes a Client instance for audio recording and streaming to a server.
@@ -80,7 +80,9 @@ def __init__(
         self.last_response_recieved = None
         self.disconnect_if_no_response_for = 15
         self.multilingual = is_multilingual
-        self.language = lang if is_multilingual else "en"
+        self.language = lang
+        self.model_size = model_size
+        self.server_error = False
         if translate:
             self.task = "translate"
 
@@ -140,11 +142,16 @@ def on_message(self, ws, message):
             print("[ERROR]: invalid client uid")
             return
 
-        if "status" in message.keys() and message["status"] == "WAIT":
-            self.waiting = True
-            print(
-                f"[INFO]:Server is full. Estimated wait time {round(message['message'])} minutes."
-            )
+        if "status" in message.keys():
+            if message["status"] == "WAIT":
+                self.waiting = True
+                print(
+                    f"[INFO]:Server is full. Estimated wait time {round(message['message'])} minutes."
+                )
+            elif message["status"] == "ERROR":
+                print(f"Message from Server: {message['message']}")
+                self.server_error = True
+            return
 
         if "message" in message.keys() and message["message"] == "DISCONNECT":
             print("[INFO]: Server overtime disconnected.")
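For reference, the new `ERROR` branch fires on a status message shaped like the `WAIT` one handled above. A hypothetical payload, with field values that are illustrative rather than taken from the server code:

```python
# What the handler above expects on the error path: "uid" must match the
# client's own uid, "status" selects the branch, and "message" is printed
# verbatim before server_error is set.
error_message = {
    "uid": "0f1e2d3c",  # hypothetical client uid
    "status": "ERROR",
    "message": "Requested model size is not available.",  # illustrative text
}
```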
@@ -213,6 +220,7 @@ def on_open(self, ws):
                     "multilingual": self.multilingual,
                     "language": self.language,
                     "task": self.task,
+                    "model_size": self.model_size,
                 }
             )
         )
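Putting the pieces together, the options handshake the client sends when the websocket opens now carries the model size. A sketch of the full JSON payload, with the `uid` field assumed from the lines elided above this hunk:

```python
import json

# Options message sent to the server in on_open; "model_size" is the
# field added by this PR. The uid value is a placeholder.
payload = json.dumps({
    "uid": "0f1e2d3c",  # assumed field, elided from this hunk
    "multilingual": False,
    "language": "en",
    "task": "transcribe",
    "model_size": "small",
})
```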
@@ -497,8 +505,8 @@ class TranscriptionClient:
         transcription_client()
         ```
     """
-    def __init__(self, host, port, is_multilingual=False, lang=None, translate=False):
-        self.client = Client(host, port, is_multilingual, lang, translate)
+    def __init__(self, host, port, is_multilingual=False, lang=None, translate=False, model_size="small"):
+        self.client = Client(host, port, is_multilingual, lang, translate, model_size)
 
     def __call__(self, audio=None, hls_url=None):
         """
@@ -514,10 +522,10 @@ def __call__(self, audio=None, hls_url=None):
         """
         print("[INFO]: Waiting for server ready ...")
         while not self.client.recording:
-            if self.client.waiting:
+            if self.client.waiting or self.client.server_error:
                 self.client.close_websocket()
                 return
-            pass
+
         print("[INFO]: Server Ready!")
         if hls_url is not None:
             self.client.process_hls_stream(hls_url)
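A short usage sketch of the new control flow: `__call__` now leaves the wait loop as soon as the client has seen a server-side error, instead of spinning forever. The model size below is illustrative:

```python
from whisper_live.client import TranscriptionClient

client = TranscriptionClient(
    "localhost",
    9090,
    model_size="large-v2",  # if the server rejects this size, on_message sets server_error
)

# Returns promptly once waiting or server_error is flagged,
# rather than blocking in the while-loop above.
client("tests/jfk.wav")
```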