diff --git a/Audio-Transcription-Chrome/README.md b/Audio-Transcription-Chrome/README.md
index 0d402901..7392f738 100644
--- a/Audio-Transcription-Chrome/README.md
+++ b/Audio-Transcription-Chrome/README.md
@@ -29,6 +29,7 @@ When using the Audio Transcription extension, you have the following options:
 - **Use Multilingual Model**: Enable this option to utilize the multilingual capabilities of OpenAI-whisper.
 - **Language**: Select the target language for transcription or translation. You can choose from a variety of languages supported by OpenAI-whisper.
 - **Task:** Choose the specific task to perform on the audio. You can select either "transcribe" for transcription or "translate" to translate the audio to English.
+- **Model Size**: Select the whisper model size to run the server with.
 
 ### Getting Started
 - Make sure the transcription server is running properly. To know more about how to start the server, see the [documentation here](https://github.com/collabora/whisper-live).
diff --git a/Audio-Transcription-Chrome/background.js b/Audio-Transcription-Chrome/background.js
index f860209b..a5028a78 100644
--- a/Audio-Transcription-Chrome/background.js
+++ b/Audio-Transcription-Chrome/background.js
@@ -156,7 +156,8 @@ async function startCapture(options) {
         port: options.port,
         multilingual: options.useMultilingual,
         language: options.language,
-        task: options.task
+        task: options.task,
+        modelSize: options.modelSize
       },
     });
   } else {
diff --git a/Audio-Transcription-Chrome/options.js b/Audio-Transcription-Chrome/options.js
index af77b386..9717856a 100644
--- a/Audio-Transcription-Chrome/options.js
+++ b/Audio-Transcription-Chrome/options.js
@@ -102,7 +102,8 @@ async function startRecord(option) {
         uid: uuid,
         multilingual: option.multilingual,
         language: option.language,
-        task: option.task
+        task: option.task,
+        model_size: option.modelSize
       })
     );
   };
diff --git a/Audio-Transcription-Chrome/popup.html b/Audio-Transcription-Chrome/popup.html
index 4f2b6ec7..79f18ca9 100644
--- a/Audio-Transcription-Chrome/popup.html
+++ b/Audio-Transcription-Chrome/popup.html
@@ -125,11 +125,22 @@
+
diff --git a/Audio-Transcription-Chrome/popup.js b/Audio-Transcription-Chrome/popup.js
index f67f8b70..a1dd210c 100644
--- a/Audio-Transcription-Chrome/popup.js
+++ b/Audio-Transcription-Chrome/popup.js
@@ -7,8 +7,10 @@ document.addEventListener("DOMContentLoaded", function () {
   const useMultilingualCheckbox = document.getElementById('useMultilingualCheckbox');
   const languageDropdown = document.getElementById('languageDropdown');
   const taskDropdown = document.getElementById('taskDropdown');
+  const modelSizeDropdown = document.getElementById('modelSizeDropdown');
   let selectedLanguage = null;
   let selectedTask = taskDropdown.value;
+  let selectedModelSize = modelSizeDropdown.value;
 
   // Add click event listeners to the buttons
   startButton.addEventListener("click", startCapture);
@@ -52,6 +54,13 @@ document.addEventListener("DOMContentLoaded", function () {
     }
   });
 
+  chrome.storage.local.get("selectedModelSize", ({ selectedModelSize: storedModelSize }) => {
+    if (storedModelSize !== undefined) {
+      modelSizeDropdown.value = storedModelSize;
+      selectedModelSize = storedModelSize;
+    }
+  });
+
   // Function to handle the start capture button click event
   async function startCapture() {
     // Ignore click if the button is disabled
@@ -64,7 +73,7 @@ document.addEventListener("DOMContentLoaded", function () {
 
     // Send a message to the background script to start capturing
     let host = "localhost";
-    let port = "9090";
+    let port = "5901";
     const useCollaboraServer = useServerCheckbox.checked;
     if (useCollaboraServer){
       host = "transcription.kurg.org"
@@ -79,7 +88,8 @@ document.addEventListener("DOMContentLoaded", function () {
         port: port,
         useMultilingual: useMultilingualCheckbox.checked,
         language: selectedLanguage,
-        task: selectedTask
+        task: selectedTask,
+        modelSize: selectedModelSize
       }, () => {
         // Update capturing state in storage and toggle the buttons
         chrome.storage.local.set({ capturingState: { isCapturing: true } }, () => {
@@ -120,6 +130,7 @@ document.addEventListener("DOMContentLoaded", function () {
     stopButton.disabled = !isCapturing;
     useServerCheckbox.disabled = isCapturing;
     useMultilingualCheckbox.disabled = isCapturing;
+    modelSizeDropdown.disabled = isCapturing;
 
     startButton.classList.toggle("disabled", isCapturing);
     stopButton.classList.toggle("disabled", !isCapturing);
@@ -157,6 +168,11 @@ document.addEventListener("DOMContentLoaded", function () {
     chrome.storage.local.set({ selectedTask });
   });
 
+  modelSizeDropdown.addEventListener('change', function() {
+    selectedModelSize = modelSizeDropdown.value;
+    chrome.storage.local.set({ selectedModelSize });
+  });
+
   chrome.runtime.onMessage.addListener(async (request, sender, sendResponse) => {
     if (request.action === "updateSelectedLanguage") {
       const detectedLanguage = request.detectedLanguage;
diff --git a/Audio-Transcription-Firefox/README.md b/Audio-Transcription-Firefox/README.md
index 52e4ceb2..be48b786 100644
--- a/Audio-Transcription-Firefox/README.md
+++ b/Audio-Transcription-Firefox/README.md
@@ -27,6 +27,7 @@ When using the Audio Transcription extension, you have the following options:
 - **Use Multilingual Model**: Enable this option to utilize the multilingual capabilities of OpenAI-whisper.
 - **Language**: Select the target language for transcription or translation. You can choose from a variety of languages supported by OpenAI-whisper.
 - **Task:** Choose the specific task to perform on the audio. You can select either "transcribe" for transcription or "translate" to translate the audio to English.
+- **Model Size**: Select the whisper model size to run the server with.
 
 ### Getting Started
 - Make sure the transcription server is running properly. To know more about how to start the server, see the [documentation here](https://github.com/collabora/whisper-live).
diff --git a/Audio-Transcription-Firefox/content.js b/Audio-Transcription-Firefox/content.js
index 1d866c8c..682b142a 100644
--- a/Audio-Transcription-Firefox/content.js
+++ b/Audio-Transcription-Firefox/content.js
@@ -77,7 +77,8 @@ function startRecording(data) {
         uid: uuid,
         multilingual: data.useMultilingual,
         language: data.language,
-        task: data.task
+        task: data.task,
+        model_size: data.modelSize
       })
     );
   };
diff --git a/Audio-Transcription-Firefox/popup.html b/Audio-Transcription-Firefox/popup.html
index 7ca020ab..2755a762 100644
--- a/Audio-Transcription-Firefox/popup.html
+++ b/Audio-Transcription-Firefox/popup.html
@@ -133,5 +133,16 @@
+
\ No newline at end of file
diff --git a/Audio-Transcription-Firefox/popup.js b/Audio-Transcription-Firefox/popup.js
index a707fcab..b6785dd9 100644
--- a/Audio-Transcription-Firefox/popup.js
+++ b/Audio-Transcription-Firefox/popup.js
@@ -6,8 +6,11 @@ document.addEventListener("DOMContentLoaded", function() {
   const useMultilingualCheckbox = document.getElementById('useMultilingualCheckbox');
   const languageDropdown = document.getElementById('languageDropdown');
   const taskDropdown = document.getElementById('taskDropdown');
+  const modelSizeDropdown = document.getElementById('modelSizeDropdown');
   let selectedLanguage = null;
   let selectedTask = taskDropdown.value;
+  let selectedModelSize = modelSizeDropdown.value;
+
 
   browser.storage.local.get("capturingState")
     .then(function(result) {
@@ -54,9 +57,16 @@ document.addEventListener("DOMContentLoaded", function() {
     }
   });
 
+  browser.storage.local.get("selectedModelSize", ({ selectedModelSize: storedModelSize }) => {
+    if (storedModelSize !== undefined) {
+      modelSizeDropdown.value = storedModelSize;
+      selectedModelSize = storedModelSize;
+    }
+  });
+
   startButton.addEventListener("click", function() {
     let host = "localhost";
-    let port = "9090";
+    let port = "5901";
 
     const useCollaboraServer = useServerCheckbox.checked;
     if (useCollaboraServer){
@@ -75,7 +85,8 @@ document.addEventListener("DOMContentLoaded", function() {
         port: port,
         useMultilingual: useMultilingualCheckbox.checked,
         language: selectedLanguage,
-        task: selectedTask
+        task: selectedTask,
+        modelSize: selectedModelSize
       }
     });
     toggleCaptureButtons(true);
@@ -115,6 +126,7 @@ document.addEventListener("DOMContentLoaded", function() {
     stopButton.disabled = !isCapturing;
     useServerCheckbox.disabled = isCapturing;
     useMultilingualCheckbox.disabled = isCapturing;
+    modelSizeDropdown.disabled = isCapturing;
 
     startButton.classList.toggle("disabled", isCapturing);
     stopButton.classList.toggle("disabled", !isCapturing);
@@ -152,6 +164,11 @@ document.addEventListener("DOMContentLoaded", function() {
     browser.storage.local.set({ selectedTask });
   });
 
+  modelSizeDropdown.addEventListener('change', function() {
+    selectedModelSize = modelSizeDropdown.value;
+    browser.storage.local.set({ selectedModelSize });
+  });
+
   browser.runtime.onMessage.addListener((request, sender, sendResponse) => {
     if (request.action === "updateSelectedLanguage") {
       const detectedLanguage = request.data;
diff --git a/README.md b/README.md
index 69e20e3b..37fc8814 100644
--- a/README.md
+++ b/README.md
@@ -28,18 +28,33 @@ Unlike traditional speech recognition systems that rely on continuous audio stre
 - To transcribe an audio file:
   ```python
   from whisper_live.client import TranscriptionClient
-  client = TranscriptionClient("localhost", 9090, is_multilingual=True, lang="hi", translate=True)
-  client(audio_file_path)
+  client = TranscriptionClient(
+    "localhost",
+    9090,
+    is_multilingual=False,
+    lang="en",
+    translate=False,
+    model_size="small"
+  )
+
+  client("tests/jfk.wav")
   ```
-  This command transcribes the specified audio file (audio.wav) using the Whisper model. It connects to the server running on localhost at port 9090. It also enables the multilingual feature, allowing transcription in multiple languages. The language option specifies the target language for transcription, in this case, Hindi ("hi"). The translate option should be set to `True` if we want to translate from the source language to English and `False` if we want to transcribe in the source language.
+  This command transcribes the specified audio file (tests/jfk.wav) using the Whisper model. It connects to the server running on localhost at port 9090. The multilingual feature can also be enabled, allowing transcription in multiple languages. The language option specifies the target language for transcription, in this case, English ("en"). The translate option should be set to `True` if we want to translate from the source language to English and `False` if we want to transcribe in the source language.
 
 - To transcribe from microphone:
   ```python
   from whisper_live.client import TranscriptionClient
-  client = TranscriptionClient(host, port, is_multilingual=True, lang="hi", translate=True)
+  client = TranscriptionClient(
+    "localhost",
+    9090,
+    is_multilingual=True,
+    lang="hi",
+    translate=True,
+    model_size="small"
+  )
   client()
   ```
-  This command captures audio from the microphone and sends it to the server for transcription. It uses the same options as the previous command, enabling the multilingual feature and specifying the target language and task.
+  This command captures audio from the microphone and sends it to the server for transcription. It enables the multilingual feature with `hi` as the selected language, specifying both the target language and the task. The whisper `small` model is used by default, but it can be changed to any other model size depending on the requirements and the hardware running the server.
 
 - To trasncribe from a HLS stream:
   ```python
diff --git a/tests/jfk.flac b/tests/jfk.flac
new file mode 100644
index 00000000..e44b7c13
Binary files /dev/null and b/tests/jfk.flac differ
diff --git a/whisper_live/client.py b/whisper_live/client.py
index 070845db..633c1237 100644
--- a/whisper_live/client.py
+++ b/whisper_live/client.py
@@ -50,7 +50,7 @@ class Client:
     INSTANCES = {}
 
     def __init__(
-        self, host=None, port=None, is_multilingual=False, lang=None, translate=False
+        self, host=None, port=None, is_multilingual=False, lang=None, translate=False, model_size="small"
     ):
         """
         Initializes a Client instance for audio recording and streaming to a server.
@@ -80,7 +80,9 @@ def __init__(
         self.last_response_recieved = None
         self.disconnect_if_no_response_for = 15
         self.multilingual = is_multilingual
-        self.language = lang if is_multilingual else "en"
+        self.language = lang
+        self.model_size = model_size
+        self.server_error = False
 
         if translate:
             self.task = "translate"
@@ -140,11 +142,16 @@ def on_message(self, ws, message):
             print("[ERROR]: invalid client uid")
             return
 
-        if "status" in message.keys() and message["status"] == "WAIT":
-            self.waiting = True
-            print(
-                f"[INFO]:Server is full. Estimated wait time {round(message['message'])} minutes."
-            )
+        if "status" in message.keys():
+            if message["status"] == "WAIT":
+                self.waiting = True
+                print(
+                    f"[INFO]:Server is full. Estimated wait time {round(message['message'])} minutes."
+                )
+            elif message["status"] == "ERROR":
+                print(f"Message from Server: {message['message']}")
+                self.server_error = True
+            return
 
         if "message" in message.keys() and message["message"] == "DISCONNECT":
             print("[INFO]: Server overtime disconnected.")
@@ -213,6 +220,7 @@ def on_open(self, ws):
                     "multilingual": self.multilingual,
                     "language": self.language,
                     "task": self.task,
+                    "model_size": self.model_size,
                 }
             )
         )
@@ -497,8 +505,8 @@ class TranscriptionClient:
     transcription_client()
     ```
     """
-    def __init__(self, host, port, is_multilingual=False, lang=None, translate=False):
-        self.client = Client(host, port, is_multilingual, lang, translate)
+    def __init__(self, host, port, is_multilingual=False, lang=None, translate=False, model_size="small"):
+        self.client = Client(host, port, is_multilingual, lang, translate, model_size)
 
     def __call__(self, audio=None, hls_url=None):
         """
@@ -514,10 +522,10 @@ def __call__(self, audio=None, hls_url=None):
         """
         print("[INFO]: Waiting for server ready ...")
         while not self.client.recording:
-            if self.client.waiting:
+            if self.client.waiting or self.client.server_error:
                 self.client.close_websocket()
                 return
-            pass
+
         print("[INFO]: Server Ready!")
         if hls_url is not None:
             self.client.process_hls_stream(hls_url)
diff --git a/whisper_live/server.py b/whisper_live/server.py
index 18bf5950..1fbd531a 100644
--- a/whisper_live/server.py
+++ b/whisper_live/server.py
@@ -101,9 +101,10 @@ def recv_audio(self, websocket):
                 multilingual=options["multilingual"],
                 language=options["language"],
                 task=options["task"],
-                client_uid=options["uid"]
+                client_uid=options["uid"],
+                model_size=options["model_size"]
             )
-            
+
         self.clients[websocket] = client
         self.clients_start_time[websocket] = time.time()
 
@@ -127,7 +128,8 @@ def recv_audio(self, websocket):
 
         except Exception as e:
             logging.error(e)
-            self.clients[websocket].cleanup()
+            if self.clients[websocket].model_size is not None:
+                self.clients[websocket].cleanup()
             self.clients.pop(websocket)
             self.clients_start_time.pop(websocket)
             logging.info("Connection Closed.")
@@ -180,7 +182,16 @@ class ServeClient:
     SERVER_READY = "SERVER_READY"
     DISCONNECT = "DISCONNECT"
 
-    def __init__(self, websocket, task="transcribe", device=None, multilingual=False, language=None, client_uid=None):
+    def __init__(
+        self,
+        websocket,
+        task="transcribe",
+        device=None,
+        multilingual=False,
+        language=None,
+        client_uid=None,
+        model_size="small"
+    ):
         """
         Initialize a ServeClient instance.
         The Whisper model is initialized based on the client's language and device availability.
@@ -199,11 +210,22 @@ def __init__(self, websocket, task="transcribe", device=None, multilingual=False
         self.client_uid = client_uid
         self.data = b""
         self.frames = b""
-        self.language = language if multilingual else "en"
+        self.model_sizes = [
+            "tiny", "base", "small", "medium", "large-v2"
+        ]
+        self.multilingual = multilingual
+        self.model_size = self.get_model_size(model_size)
+        self.language = language if self.multilingual else "en"
         self.task = task
+        self.websocket = websocket
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        if self.model_size is None:
+            return
+
         self.transcriber = WhisperModel(
-            "small" if multilingual else "small.en",
+            self.model_size,
             device=device,
             compute_type="int8" if device=="cpu" else "float16",
             local_files_only=False,
@@ -228,7 +250,6 @@ def __init__(self, websocket, task="transcribe", device=None, multilingual=False
         self.pick_previous_segments = 2
 
         # threading
-        self.websocket = websocket
         self.trans_thread = threading.Thread(target=self.speech_to_text)
         self.trans_thread.start()
         self.websocket.send(
@@ -240,34 +261,30 @@ def __init__(self, websocket, task="transcribe", device=None, multilingual=False
             )
         )
 
-    def fill_output(self, output):
+    def get_model_size(self, model_size):
+        """
+        Returns the whisper model size to load, adjusted for the multilingual setting, or None if the requested size is not available.
         """
-        Format the current incomplete transcription output by combining it with previous complete segments.
-        The resulting transcription is wrapped into two lines, each containing a maximum of 50 characters.
+        if model_size not in self.model_sizes:
+            self.websocket.send(
+                json.dumps(
+                    {
+                        "uid": self.client_uid,
+                        "status": "ERROR",
+                        "message": f"Invalid model size {model_size}. Available choices: {self.model_sizes}"
+                    }
+                )
+            )
+            return None
+
+        if model_size == "large-v2":
+            self.multilingual = True
+            return model_size
 
-        It ensures that the combined transcription fits within two lines, with a maximum of 50 characters per line.
-        Segments are concatenated in the order they exist in the list of previous segments, with the most
-        recent complete segment first and older segments prepended as needed to maintain the character limit.
-        If a 3-second pause is detected in the previous segments, any text preceding it is discarded to ensure
-        the transcription starts with the most recent complete content. The resulting transcription is returned
-        as a single string.
+        if not self.multilingual:
+            model_size = model_size + ".en"
 
-        Args:
-            output(str): The current incomplete transcription segment.
-
-        Returns:
-            str: A formatted transcription wrapped in two lines.
-        """
-        text = ''
-        pick_prev = min(len(self.text), self.pick_previous_segments)
-        for seg in self.text[-pick_prev:]:
-            # discard everything before a 3 second pause
-            if seg == '':
-                text = ''
-            else:
-                text += seg
-        wrapped = "".join(text + output)
-        return wrapped
+        return model_size
 
     def add_frames(self, frame_np):
         """