From b5b342fcc5e94e16318a195241388b2000752426 Mon Sep 17 00:00:00 2001 From: Miles B Silva Date: Mon, 18 Mar 2024 20:19:23 -0400 Subject: [PATCH 1/9] Add scripts for speech-to-text using whisper and stt+forced alignment with whisperX --- src/b2aiprep/speech_to_text.py | 66 ++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 src/b2aiprep/speech_to_text.py diff --git a/src/b2aiprep/speech_to_text.py b/src/b2aiprep/speech_to_text.py new file mode 100644 index 0000000..5ae3669 --- /dev/null +++ b/src/b2aiprep/speech_to_text.py @@ -0,0 +1,66 @@ +# Transcribes speech to text using OpenAI's whisper model +def transcribe_audio_whisper(audio_file_path, model="base"): + """ + Transcribes audio to text using OpenAI's whisper model. + + Args: + audio_file_path (str): Path to the audio file. + model (str): Model to use for transcription. Defaults to "base". + See https://github.com/openai/whisper/ for a list of all available models. + + Returns: + Result of the transcription. + """ + import whisper + + model = whisper.load_model(model) + result = model.transcribe(audio_file_path) + return result + + +# Transcribes speech to text using the whisperX model +def transcribe_audio_whisperx(audio_file_path, model="base", device="cuda", batch_size=16, + compute_type="float16", force_alignment=True, diarize=False, hf_token=None): + """ + Transcribes audio to text using OpenAI's whisper model. + + Args: + audio_file_path (str): Path to the audio file. + model (str): Model to use for transcription. Defaults to "base". + See https://github.com/openai/whisper/ for a list of all available models. + device (str): Device to use for computation. Defaults to "cuda". + batch_size (int): Batch size for transcription. Defaults to 16. + compute_type (str): Type of computation to use. Defaults to "float16". + Change to "int8" if low on GPU mem (may reduce accuracy) + + Returns: + Result of the transcription. + """ + import whisperx + + # 1. Transcribe with original whisper (batched) + model = whisperx.load_model(model, device, compute_type=compute_type) + + audio = whisperx.load_audio(audio_file_path) + result = model.transcribe(audio, batch_size=batch_size) + + if not force_alignment: + return result + + # 2. Align whisper output + model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device) + result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False) + + if not diarize: + return result + + # 3. 
Assign speaker labels + diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device) + + # add min/max number of speakers if known + diarize_segments = diarize_model(audio) + # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers) + + result = whisperx.assign_word_speakers(diarize_segments, result) + return result + From d9c90c42d30281c2cc119b2a9b87c0b765669304 Mon Sep 17 00:00:00 2001 From: Miles B Silva Date: Mon, 18 Mar 2024 20:28:25 -0400 Subject: [PATCH 2/9] Add scripts for speech-to-text using whisper and stt+forced alignment with whisperX --- src/b2aiprep/speech_to_text.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/b2aiprep/speech_to_text.py b/src/b2aiprep/speech_to_text.py index 5ae3669..1b096d1 100644 --- a/src/b2aiprep/speech_to_text.py +++ b/src/b2aiprep/speech_to_text.py @@ -44,23 +44,19 @@ def transcribe_audio_whisperx(audio_file_path, model="base", device="cuda", batc audio = whisperx.load_audio(audio_file_path) result = model.transcribe(audio, batch_size=batch_size) - if not force_alignment: - return result + if force_alignment: + # 2. Align whisper output + model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device) + result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False) - # 2. Align whisper output - model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device) - result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False) + if diarize: + # 3. Assign speaker labels + diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device) - if not diarize: - return result - - # 3. Assign speaker labels - diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device) - - # add min/max number of speakers if known - diarize_segments = diarize_model(audio) - # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers) + # add min/max number of speakers if known + diarize_segments = diarize_model(audio) + # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers) - result = whisperx.assign_word_speakers(diarize_segments, result) + result = whisperx.assign_word_speakers(diarize_segments, result) return result From d8db77fa7048642517411108c4a79dd5f3c9f583 Mon Sep 17 00:00:00 2001 From: Miles B Silva Date: Mon, 18 Mar 2024 20:32:52 -0400 Subject: [PATCH 3/9] Add args to function docstring --- src/b2aiprep/speech_to_text.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/b2aiprep/speech_to_text.py b/src/b2aiprep/speech_to_text.py index 1b096d1..5eb9f22 100644 --- a/src/b2aiprep/speech_to_text.py +++ b/src/b2aiprep/speech_to_text.py @@ -32,6 +32,9 @@ def transcribe_audio_whisperx(audio_file_path, model="base", device="cuda", batc batch_size (int): Batch size for transcription. Defaults to 16. compute_type (str): Type of computation to use. Defaults to "float16". Change to "int8" if low on GPU mem (may reduce accuracy) + force_alignment (bool): Whether or not to perform forced alignment of the speech-to-text output + diarize (bool): Whether or not to assign speaker labels to the text + hf_token (str): A Huggingface auth token, required to perform speaker diarization Returns: Result of the transcription. 
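As of PATCH 3/9 both helpers still take a path to an audio file on disk. A minimal usage sketch under that interface (the WAV path is illustrative, the printed fields assume whisper's standard result dictionary and whisperX's aligned "segments", and compute_type="int8" is only a CPU-friendly substitute for the "float16" default):

    from b2aiprep.speech_to_text import transcribe_audio_whisper, transcribe_audio_whisperx

    # Plain whisper transcription; result["text"] holds the full transcript.
    result = transcribe_audio_whisper("data/vc_source.wav", model="base")
    print(result["text"])

    # whisperX transcription with forced alignment; each aligned segment carries
    # start/end timestamps. Diarization would additionally require hf_token.
    aligned = transcribe_audio_whisperx(
        "data/vc_source.wav",
        model="base",
        device="cpu",
        compute_type="int8",
        force_alignment=True,
        diarize=False,
    )
    for segment in aligned["segments"]:
        print(segment["start"], segment["end"], segment["text"])
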
From 1717ade2128d5c03ae8a30f25200e3f4c3ab2261 Mon Sep 17 00:00:00 2001 From: Miles B Silva Date: Thu, 21 Mar 2024 21:11:27 -0400 Subject: [PATCH 4/9] Use Audio object instead of file paths; add requirements.txt --- requirements.txt | 5 +++++ src/b2aiprep/speech_to_text.py | 17 ++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d15fb81 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +faster-whisper==1.0.0 +openai-whisper==20231117 +torch==2.0.1 +torchaudio==2.0.2 +whisperx @ git+https://github.com/m-bain/whisperx.git@78dcfaab51005aa703ee21375f81ed31bc248560 diff --git a/src/b2aiprep/speech_to_text.py b/src/b2aiprep/speech_to_text.py index 5eb9f22..364c69a 100644 --- a/src/b2aiprep/speech_to_text.py +++ b/src/b2aiprep/speech_to_text.py @@ -1,10 +1,12 @@ +import typing as ty + # Transcribes speech to text using OpenAI's whisper model -def transcribe_audio_whisper(audio_file_path, model="base"): +def transcribe_audio_whisper(audio: Audio, model="base": str): """ Transcribes audio to text using OpenAI's whisper model. Args: - audio_file_path (str): Path to the audio file. + audio (Audio). Audio object. model (str): Model to use for transcription. Defaults to "base". See https://github.com/openai/whisper/ for a list of all available models. @@ -14,18 +16,18 @@ def transcribe_audio_whisper(audio_file_path, model="base"): import whisper model = whisper.load_model(model) - result = model.transcribe(audio_file_path) + result = model.transcribe(audio.signal.squeeze()) return result # Transcribes speech to text using the whisperX model -def transcribe_audio_whisperx(audio_file_path, model="base", device="cuda", batch_size=16, - compute_type="float16", force_alignment=True, diarize=False, hf_token=None): +def transcribe_audio_whisperx(audio: Audio, model="base": str, device="cuda": str, batch_size=16: int, + compute_type="float16": str, force_alignment=True: bool, diarize=False: bool, hf_token=None: ty.Optional[str]): """ Transcribes audio to text using OpenAI's whisper model. Args: - audio_file_path (str): Path to the audio file. + audio (audio): Audio object. model (str): Model to use for transcription. Defaults to "base". See https://github.com/openai/whisper/ for a list of all available models. device (str): Device to use for computation. Defaults to "cuda". @@ -44,7 +46,8 @@ def transcribe_audio_whisperx(audio_file_path, model="base", device="cuda", batc # 1. 
Transcribe with original whisper (batched) model = whisperx.load_model(model, device, compute_type=compute_type) - audio = whisperx.load_audio(audio_file_path) + # audio = whisperx.load_audio(audio.signal.squeeze()) + audio = audio.signal.squeeze().numpy() result = model.transcribe(audio, batch_size=batch_size) if force_alignment: From cf84b7955b35b8746b956dc9433c35f797273aba Mon Sep 17 00:00:00 2001 From: Miles B Silva Date: Thu, 21 Mar 2024 23:21:49 -0400 Subject: [PATCH 5/9] Move requirements to pyproject.toml and rename file --- pyproject.toml | 5 ++++- src/b2aiprep/{speech_to_text.py => speech2text.py} | 0 2 files changed, 4 insertions(+), 1 deletion(-) rename src/b2aiprep/{speech_to_text.py => speech2text.py} (100%) diff --git a/pyproject.toml b/pyproject.toml index 46a3fea..8af8853 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,10 @@ dependencies = [ "torchaudio>=2.0.0", "opensmile>=2.3.0", "click", - "pydra" + "pydra", + "openai-whisper", + "torch>=2.0.0", + "whisperx @ git+https://github.com/m-bain/whisperx.git@78dcfaab51005aa703ee21375f81ed31bc248560", ] [project.optional-dependencies] diff --git a/src/b2aiprep/speech_to_text.py b/src/b2aiprep/speech2text.py similarity index 100% rename from src/b2aiprep/speech_to_text.py rename to src/b2aiprep/speech2text.py From 8db9868cd8968f6e40ae9dd9f7455a783f557e59 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Wed, 27 Mar 2024 13:23:12 -0400 Subject: [PATCH 6/9] ref: add some dependencies and format annotations --- pyproject.toml | 1 - requirements.txt | 5 --- src/b2aiprep/speech2text.py | 65 ++++++++++++++++++++++++------------- 3 files changed, 42 insertions(+), 29 deletions(-) delete mode 100644 requirements.txt diff --git a/pyproject.toml b/pyproject.toml index 8af8853..20f8367 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,6 @@ dependencies = [ "click", "pydra", "openai-whisper", - "torch>=2.0.0", "whisperx @ git+https://github.com/m-bain/whisperx.git@78dcfaab51005aa703ee21375f81ed31bc248560", ] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index d15fb81..0000000 --- a/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -faster-whisper==1.0.0 -openai-whisper==20231117 -torch==2.0.1 -torchaudio==2.0.2 -whisperx @ git+https://github.com/m-bain/whisperx.git@78dcfaab51005aa703ee21375f81ed31bc248560 diff --git a/src/b2aiprep/speech2text.py b/src/b2aiprep/speech2text.py index 364c69a..75c0da1 100644 --- a/src/b2aiprep/speech2text.py +++ b/src/b2aiprep/speech2text.py @@ -1,43 +1,60 @@ import typing as ty +from .process import Audio + + # Transcribes speech to text using OpenAI's whisper model -def transcribe_audio_whisper(audio: Audio, model="base": str): +def transcribe_audio_whisper(audio: Audio, model: str = "base", device: ty.Optional[str] = None): """ Transcribes audio to text using OpenAI's whisper model. Args: audio (Audio). Audio object. - model (str): Model to use for transcription. Defaults to "base". + model (str): Model to use for transcription. Defaults to "base". See https://github.com/openai/whisper/ for a list of all available models. - + Returns: Result of the transcription. 
""" import whisper - - model = whisper.load_model(model) + + model = whisper.load_model( + model, + device=device, + ) result = model.transcribe(audio.signal.squeeze()) return result # Transcribes speech to text using the whisperX model -def transcribe_audio_whisperx(audio: Audio, model="base": str, device="cuda": str, batch_size=16: int, - compute_type="float16": str, force_alignment=True: bool, diarize=False: bool, hf_token=None: ty.Optional[str]): +def transcribe_audio_whisperx( + audio: Audio, + hf_token: ty.Optional[str] = None, + model: str = "base", + device: ty.Optional[str] = None, + batch_size: int = 16, + compute_type: str = "float32", + force_alignment: bool = True, + diarize: bool = False, + min_speakers: ty.Optional[int] = None, + max_speakers: ty.Optional[int] = None, +): """ Transcribes audio to text using OpenAI's whisper model. Args: audio (audio): Audio object. - model (str): Model to use for transcription. Defaults to "base". + model (str): Model to use for transcription. Defaults to "base". See https://github.com/openai/whisper/ for a list of all available models. device (str): Device to use for computation. Defaults to "cuda". batch_size (int): Batch size for transcription. Defaults to 16. compute_type (str): Type of computation to use. Defaults to "float16". Change to "int8" if low on GPU mem (may reduce accuracy) - force_alignment (bool): Whether or not to perform forced alignment of the speech-to-text output - diarize (bool): Whether or not to assign speaker labels to the text - hf_token (str): A Huggingface auth token, required to perform speaker diarization - + force_alignment (bool): Whether or not to perform forced alignment of the + speech-to-text output + diarize (bool): Whether or not to assign speaker labels to the text + hf_token (str): A Huggingface auth token, required to perform speaker diarization + Returns: Result of the transcription. """ @@ -51,18 +68,20 @@ def transcribe_audio_whisperx(audio: Audio, model="base": str, device="cuda": st result = model.transcribe(audio, batch_size=batch_size) if force_alignment: - # 2. Align whisper output - model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device) - result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False) + # 2. Align whisper output + model_a, metadata = whisperx.load_align_model( + language_code=result["language"], device=device + ) + result = whisperx.align( + result["segments"], model_a, metadata, audio, device, return_char_alignments=False + ) if diarize: - # 3. Assign speaker labels - diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device) - - # add min/max number of speakers if known - diarize_segments = diarize_model(audio) - # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers) + # 3. 
Assign speaker labels + diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device) - result = whisperx.assign_word_speakers(diarize_segments, result) + # add min/max number of speakers if known + diarize_segments = diarize_model(audio) + diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers) + result = whisperx.assign_word_speakers(diarize_segments, result) return result - From 56c36bfa6c309bf6eb3c059f80f4c24a8f0b6736 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Wed, 27 Mar 2024 22:31:54 -0400 Subject: [PATCH 7/9] fix: adjust to 16KHz sample rate and allow for char level transcription --- src/b2aiprep/speech2text.py | 41 +++++++++++++------------------------ 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/src/b2aiprep/speech2text.py b/src/b2aiprep/speech2text.py index 75c0da1..1ff7f91 100644 --- a/src/b2aiprep/speech2text.py +++ b/src/b2aiprep/speech2text.py @@ -1,29 +1,9 @@ import typing as ty -from .process import Audio - - -# Transcribes speech to text using OpenAI's whisper model -def transcribe_audio_whisper(audio: Audio, model: str = "base", device: ty.Optional[str] = None): - """ - Transcribes audio to text using OpenAI's whisper model. +import torch +import whisperx - Args: - audio (Audio). Audio object. - model (str): Model to use for transcription. Defaults to "base". - See https://github.com/openai/whisper/ for a list of all available models. - - Returns: - Result of the transcription. - """ - import whisper - - model = whisper.load_model( - model, - device=device, - ) - result = model.transcribe(audio.signal.squeeze()) - return result +from .process import Audio # Transcribes speech to text using the whisperX model @@ -33,8 +13,9 @@ def transcribe_audio_whisperx( model: str = "base", device: ty.Optional[str] = None, batch_size: int = 16, - compute_type: str = "float32", + compute_type: ty.Optional[str] = None, force_alignment: bool = True, + return_char_alignments: bool = False, diarize: bool = False, min_speakers: ty.Optional[int] = None, max_speakers: ty.Optional[int] = None, @@ -58,12 +39,13 @@ def transcribe_audio_whisperx( Returns: Result of the transcription. """ - import whisperx # 1. 
Transcribe with original whisper (batched) + device = device or "cuda" if torch.cuda.is_available() else "cpu" model = whisperx.load_model(model, device, compute_type=compute_type) - # audio = whisperx.load_audio(audio.signal.squeeze()) + if audio.sample_rate != 16000: + audio = audio.to_16khz() audio = audio.signal.squeeze().numpy() result = model.transcribe(audio, batch_size=batch_size) @@ -73,7 +55,12 @@ def transcribe_audio_whisperx( language_code=result["language"], device=device ) result = whisperx.align( - result["segments"], model_a, metadata, audio, device, return_char_alignments=False + result["segments"], + model_a, + metadata, + audio, + device, + return_char_alignments=return_char_alignments, ) if diarize: From 0c4671a265b47c9b597cf13ae3544c87eca9765e Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Wed, 27 Mar 2024 22:52:43 -0400 Subject: [PATCH 8/9] tst: add whisperx test --- src/tests/test_speech_to_text.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/tests/test_speech_to_text.py b/src/tests/test_speech_to_text.py index 27326a4..1c6e0e4 100644 --- a/src/tests/test_speech_to_text.py +++ b/src/tests/test_speech_to_text.py @@ -3,6 +3,7 @@ import pytest from b2aiprep.process import Audio, SpeechToText +from b2aiprep.speech2text import transcribe_audio_whisperx def test_transcribe(): @@ -22,6 +23,21 @@ def test_transcribe(): assert text.strip() == audio_content +def test_transcribe_whisperx(): + """ + Validates SpeechToText's ability to convert audio to text accurately. + Checks if the transcription matches the expected output, considering known model discrepancies. + """ + audio_path = str((Path(__file__).parent.parent.parent / "data/vc_source.wav").absolute()) + audio_content = "If it isn't, it isn't." + + # Note: Should be "If it didn't, it didn't.", but that's what the model understands + audio = Audio.from_file(audio_path) + + result = transcribe_audio_whisperx(audio, model="tiny", device="cpu", compute_type="float32") + assert result["segments"][0]["text"].strip() == audio_content + + def test_cuda_not_available(): """ Test behavior when CUDA is not available. From 4a31e7706a85209993a6838155532862f978d375 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Wed, 27 Mar 2024 23:16:55 -0400 Subject: [PATCH 9/9] ref: restrict python for TTS --- pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ef6d241..d7a54d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,14 +19,14 @@ classifiers = [ "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ] -requires-python = ">=3.10" +requires-python = ">=3.10, <3.12" dependencies = [ "speechbrain>=1.0.0", "torchaudio>=2.0.0", "opensmile>=2.3.0", "matplotlib>=3.8.3", "click", - "whisperx @ git+https://github.com/m-bain/whisperx.git@78dcfaab51005aa703ee21375f81ed31bc248560", + "whisperx @ git+https://github.com/m-bain/whisperx.git@f2da2f858e99e4211fe4f64b5f2938b007827e17", "pydra~=0.23", "TTS", "accelerate", @@ -36,6 +36,7 @@ dependencies = [ [project.optional-dependencies] dev = [ "pytest", + "pre-commit" ] [project.scripts]
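
For completeness, a minimal end-to-end sketch of the final interface, mirroring the call made in src/tests/test_speech_to_text.py (the audio file and the "tiny" model come from that test; the commented-out diarization call additionally assumes a valid Hugging Face auth token):

    from b2aiprep.process import Audio
    from b2aiprep.speech2text import transcribe_audio_whisperx

    # Load the recording; the function resamples to 16 kHz internally if needed.
    audio = Audio.from_file("data/vc_source.wav")

    # CPU-friendly settings, as exercised by the test suite.
    result = transcribe_audio_whisperx(audio, model="tiny", device="cpu", compute_type="float32")
    print(result["segments"][0]["text"])

    # With a GPU and a Hugging Face token, speaker labels can be requested as well:
    # result = transcribe_audio_whisperx(
    #     audio, hf_token="<token>", diarize=True, min_speakers=1, max_speakers=2
    # )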