From b5b342fcc5e94e16318a195241388b2000752426 Mon Sep 17 00:00:00 2001 From: Miles B Silva Date: Mon, 18 Mar 2024 20:19:23 -0400 Subject: [PATCH 1/9] Add scripts for speech-to-text using whisper and stt+forced alignment with whisperX --- src/b2aiprep/speech_to_text.py | 66 ++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 src/b2aiprep/speech_to_text.py diff --git a/src/b2aiprep/speech_to_text.py b/src/b2aiprep/speech_to_text.py new file mode 100644 index 0000000..5ae3669 --- /dev/null +++ b/src/b2aiprep/speech_to_text.py @@ -0,0 +1,66 @@ +# Transcribes speech to text using OpenAI's whisper model +def transcribe_audio_whisper(audio_file_path, model="base"): + """ + Transcribes audio to text using OpenAI's whisper model. + + Args: + audio_file_path (str): Path to the audio file. + model (str): Model to use for transcription. Defaults to "base". + See https://github.com/openai/whisper/ for a list of all available models. + + Returns: + Result of the transcription. + """ + import whisper + + model = whisper.load_model(model) + result = model.transcribe(audio_file_path) + return result + + +# Transcribes speech to text using the whisperX model +def transcribe_audio_whisperx(audio_file_path, model="base", device="cuda", batch_size=16, + compute_type="float16", force_alignment=True, diarize=False, hf_token=None): + """ + Transcribes audio to text using OpenAI's whisper model. + + Args: + audio_file_path (str): Path to the audio file. + model (str): Model to use for transcription. Defaults to "base". + See https://github.com/openai/whisper/ for a list of all available models. + device (str): Device to use for computation. Defaults to "cuda". + batch_size (int): Batch size for transcription. Defaults to 16. + compute_type (str): Type of computation to use. Defaults to "float16". + Change to "int8" if low on GPU mem (may reduce accuracy) + + Returns: + Result of the transcription. + """ + import whisperx + + # 1. Transcribe with original whisper (batched) + model = whisperx.load_model(model, device, compute_type=compute_type) + + audio = whisperx.load_audio(audio_file_path) + result = model.transcribe(audio, batch_size=batch_size) + + if not force_alignment: + return result + + # 2. Align whisper output + model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device) + result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False) + + if not diarize: + return result + + # 3. 
Assign speaker labels + diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device) + + # add min/max number of speakers if known + diarize_segments = diarize_model(audio) + # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers) + + result = whisperx.assign_word_speakers(diarize_segments, result) + return result + From d9c90c42d30281c2cc119b2a9b87c0b765669304 Mon Sep 17 00:00:00 2001 From: Miles B Silva Date: Mon, 18 Mar 2024 20:28:25 -0400 Subject: [PATCH 2/9] Add scripts for speech-to-text using whisper and stt+forced alignment with whisperX --- src/b2aiprep/speech_to_text.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/b2aiprep/speech_to_text.py b/src/b2aiprep/speech_to_text.py index 5ae3669..1b096d1 100644 --- a/src/b2aiprep/speech_to_text.py +++ b/src/b2aiprep/speech_to_text.py @@ -44,23 +44,19 @@ def transcribe_audio_whisperx(audio_file_path, model="base", device="cuda", batc audio = whisperx.load_audio(audio_file_path) result = model.transcribe(audio, batch_size=batch_size) - if not force_alignment: - return result + if force_alignment: + # 2. Align whisper output + model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device) + result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False) - # 2. Align whisper output - model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device) - result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False) + if diarize: + # 3. Assign speaker labels + diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device) - if not diarize: - return result - - # 3. Assign speaker labels - diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device) - - # add min/max number of speakers if known - diarize_segments = diarize_model(audio) - # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers) + # add min/max number of speakers if known + diarize_segments = diarize_model(audio) + # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers) - result = whisperx.assign_word_speakers(diarize_segments, result) + result = whisperx.assign_word_speakers(diarize_segments, result) return result From d8db77fa7048642517411108c4a79dd5f3c9f583 Mon Sep 17 00:00:00 2001 From: Miles B Silva Date: Mon, 18 Mar 2024 20:32:52 -0400 Subject: [PATCH 3/9] Add args to function docstring --- src/b2aiprep/speech_to_text.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/b2aiprep/speech_to_text.py b/src/b2aiprep/speech_to_text.py index 1b096d1..5eb9f22 100644 --- a/src/b2aiprep/speech_to_text.py +++ b/src/b2aiprep/speech_to_text.py @@ -32,6 +32,9 @@ def transcribe_audio_whisperx(audio_file_path, model="base", device="cuda", batc batch_size (int): Batch size for transcription. Defaults to 16. compute_type (str): Type of computation to use. Defaults to "float16". Change to "int8" if low on GPU mem (may reduce accuracy) + force_alignment (bool): Whether or not to perform forced alignment of the speech-to-text output + diarize (bool): Whether or not to assign speaker labels to the text + hf_token (str): A Huggingface auth token, required to perform speaker diarization Returns: Result of the transcription. 
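As of PATCH 3/9 both helpers still take a path to an audio file on disk. A minimal usage sketch under that interface (the WAV path is illustrative, the printed fields assume whisper's standard result dictionary and whisperX's aligned "segments", and compute_type="int8" is only a CPU-friendly substitute for the "float16" default):

    from b2aiprep.speech_to_text import transcribe_audio_whisper, transcribe_audio_whisperx

    # Plain whisper transcription; result["text"] holds the full transcript.
    result = transcribe_audio_whisper("data/vc_source.wav", model="base")
    print(result["text"])

    # whisperX transcription with forced alignment; each aligned segment carries
    # start/end timestamps. Diarization would additionally require hf_token.
    aligned = transcribe_audio_whisperx(
        "data/vc_source.wav",
        model="base",
        device="cpu",
        compute_type="int8",
        force_alignment=True,
        diarize=False,
    )
    for segment in aligned["segments"]:
        print(segment["start"], segment["end"], segment["text"])
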
From 1717ade2128d5c03ae8a30f25200e3f4c3ab2261 Mon Sep 17 00:00:00 2001 From: Miles B Silva Date: Thu, 21 Mar 2024 21:11:27 -0400 Subject: [PATCH 4/9] Use Audio object instead of file paths; add requirements.txt --- requirements.txt | 5 +++++ src/b2aiprep/speech_to_text.py | 17 ++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d15fb81 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +faster-whisper==1.0.0 +openai-whisper==20231117 +torch==2.0.1 +torchaudio==2.0.2 +whisperx @ git+https://github.com/m-bain/whisperx.git@78dcfaab51005aa703ee21375f81ed31bc248560 diff --git a/src/b2aiprep/speech_to_text.py b/src/b2aiprep/speech_to_text.py index 5eb9f22..364c69a 100644 --- a/src/b2aiprep/speech_to_text.py +++ b/src/b2aiprep/speech_to_text.py @@ -1,10 +1,12 @@ +import typing as ty + # Transcribes speech to text using OpenAI's whisper model -def transcribe_audio_whisper(audio_file_path, model="base"): +def transcribe_audio_whisper(audio: Audio, model="base": str): """ Transcribes audio to text using OpenAI's whisper model. Args: - audio_file_path (str): Path to the audio file. + audio (Audio). Audio object. model (str): Model to use for transcription. Defaults to "base". See https://github.com/openai/whisper/ for a list of all available models. @@ -14,18 +16,18 @@ def transcribe_audio_whisper(audio_file_path, model="base"): import whisper model = whisper.load_model(model) - result = model.transcribe(audio_file_path) + result = model.transcribe(audio.signal.squeeze()) return result # Transcribes speech to text using the whisperX model -def transcribe_audio_whisperx(audio_file_path, model="base", device="cuda", batch_size=16, - compute_type="float16", force_alignment=True, diarize=False, hf_token=None): +def transcribe_audio_whisperx(audio: Audio, model="base": str, device="cuda": str, batch_size=16: int, + compute_type="float16": str, force_alignment=True: bool, diarize=False: bool, hf_token=None: ty.Optional[str]): """ Transcribes audio to text using OpenAI's whisper model. Args: - audio_file_path (str): Path to the audio file. + audio (audio): Audio object. model (str): Model to use for transcription. Defaults to "base". See https://github.com/openai/whisper/ for a list of all available models. device (str): Device to use for computation. Defaults to "cuda". @@ -44,7 +46,8 @@ def transcribe_audio_whisperx(audio_file_path, model="base", device="cuda", batc # 1. 
Transcribe with original whisper (batched) model = whisperx.load_model(model, device, compute_type=compute_type) - audio = whisperx.load_audio(audio_file_path) + # audio = whisperx.load_audio(audio.signal.squeeze()) + audio = audio.signal.squeeze().numpy() result = model.transcribe(audio, batch_size=batch_size) if force_alignment: From cf84b7955b35b8746b956dc9433c35f797273aba Mon Sep 17 00:00:00 2001 From: Miles B Silva Date: Thu, 21 Mar 2024 23:21:49 -0400 Subject: [PATCH 5/9] Move requirements to pyproject.toml and rename file --- pyproject.toml | 5 ++++- src/b2aiprep/{speech_to_text.py => speech2text.py} | 0 2 files changed, 4 insertions(+), 1 deletion(-) rename src/b2aiprep/{speech_to_text.py => speech2text.py} (100%) diff --git a/pyproject.toml b/pyproject.toml index 46a3fea..8af8853 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,10 @@ dependencies = [ "torchaudio>=2.0.0", "opensmile>=2.3.0", "click", - "pydra" + "pydra", + "openai-whisper", + "torch>=2.0.0", + "whisperx @ git+https://github.com/m-bain/whisperx.git@78dcfaab51005aa703ee21375f81ed31bc248560", ] [project.optional-dependencies] diff --git a/src/b2aiprep/speech_to_text.py b/src/b2aiprep/speech2text.py similarity index 100% rename from src/b2aiprep/speech_to_text.py rename to src/b2aiprep/speech2text.py From 8db9868cd8968f6e40ae9dd9f7455a783f557e59 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Wed, 27 Mar 2024 13:23:12 -0400 Subject: [PATCH 6/9] ref: add some dependencies and format annotations --- pyproject.toml | 1 - requirements.txt | 5 --- src/b2aiprep/speech2text.py | 65 ++++++++++++++++++++++++------------- 3 files changed, 42 insertions(+), 29 deletions(-) delete mode 100644 requirements.txt diff --git a/pyproject.toml b/pyproject.toml index 8af8853..20f8367 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,6 @@ dependencies = [ "click", "pydra", "openai-whisper", - "torch>=2.0.0", "whisperx @ git+https://github.com/m-bain/whisperx.git@78dcfaab51005aa703ee21375f81ed31bc248560", ] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index d15fb81..0000000 --- a/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -faster-whisper==1.0.0 -openai-whisper==20231117 -torch==2.0.1 -torchaudio==2.0.2 -whisperx @ git+https://github.com/m-bain/whisperx.git@78dcfaab51005aa703ee21375f81ed31bc248560 diff --git a/src/b2aiprep/speech2text.py b/src/b2aiprep/speech2text.py index 364c69a..75c0da1 100644 --- a/src/b2aiprep/speech2text.py +++ b/src/b2aiprep/speech2text.py @@ -1,43 +1,60 @@ import typing as ty +from .process import Audio + + # Transcribes speech to text using OpenAI's whisper model -def transcribe_audio_whisper(audio: Audio, model="base": str): +def transcribe_audio_whisper(audio: Audio, model: str = "base", device: ty.Optional[str] = None): """ Transcribes audio to text using OpenAI's whisper model. Args: audio (Audio). Audio object. - model (str): Model to use for transcription. Defaults to "base". + model (str): Model to use for transcription. Defaults to "base". See https://github.com/openai/whisper/ for a list of all available models. - + Returns: Result of the transcription. 
""" import whisper - - model = whisper.load_model(model) + + model = whisper.load_model( + model, + device=device, + ) result = model.transcribe(audio.signal.squeeze()) return result # Transcribes speech to text using the whisperX model -def transcribe_audio_whisperx(audio: Audio, model="base": str, device="cuda": str, batch_size=16: int, - compute_type="float16": str, force_alignment=True: bool, diarize=False: bool, hf_token=None: ty.Optional[str]): +def transcribe_audio_whisperx( + audio: Audio, + hf_token: ty.Optional[str] = None, + model: str = "base", + device: ty.Optional[str] = None, + batch_size: int = 16, + compute_type: str = "float32", + force_alignment: bool = True, + diarize: bool = False, + min_speakers: ty.Optional[int] = None, + max_speakers: ty.Optional[int] = None, +): """ Transcribes audio to text using OpenAI's whisper model. Args: audio (audio): Audio object. - model (str): Model to use for transcription. Defaults to "base". + model (str): Model to use for transcription. Defaults to "base". See https://github.com/openai/whisper/ for a list of all available models. device (str): Device to use for computation. Defaults to "cuda". batch_size (int): Batch size for transcription. Defaults to 16. compute_type (str): Type of computation to use. Defaults to "float16". Change to "int8" if low on GPU mem (may reduce accuracy) - force_alignment (bool): Whether or not to perform forced alignment of the speech-to-text output - diarize (bool): Whether or not to assign speaker labels to the text - hf_token (str): A Huggingface auth token, required to perform speaker diarization - + force_alignment (bool): Whether or not to perform forced alignment of the + speech-to-text output + diarize (bool): Whether or not to assign speaker labels to the text + hf_token (str): A Huggingface auth token, required to perform speaker diarization + Returns: Result of the transcription. """ @@ -51,18 +68,20 @@ def transcribe_audio_whisperx(audio: Audio, model="base": str, device="cuda": st result = model.transcribe(audio, batch_size=batch_size) if force_alignment: - # 2. Align whisper output - model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device) - result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False) + # 2. Align whisper output + model_a, metadata = whisperx.load_align_model( + language_code=result["language"], device=device + ) + result = whisperx.align( + result["segments"], model_a, metadata, audio, device, return_char_alignments=False + ) if diarize: - # 3. Assign speaker labels - diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device) - - # add min/max number of speakers if known - diarize_segments = diarize_model(audio) - # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers) + # 3. 
Assign speaker labels + diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device) - result = whisperx.assign_word_speakers(diarize_segments, result) + # add min/max number of speakers if known + diarize_segments = diarize_model(audio) + diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers) + result = whisperx.assign_word_speakers(diarize_segments, result) return result - From 56c36bfa6c309bf6eb3c059f80f4c24a8f0b6736 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Wed, 27 Mar 2024 22:31:54 -0400 Subject: [PATCH 7/9] fix: adjust to 16KHz sample rate and allow for char level transcription --- src/b2aiprep/speech2text.py | 41 +++++++++++++------------------------ 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/src/b2aiprep/speech2text.py b/src/b2aiprep/speech2text.py index 75c0da1..1ff7f91 100644 --- a/src/b2aiprep/speech2text.py +++ b/src/b2aiprep/speech2text.py @@ -1,29 +1,9 @@ import typing as ty -from .process import Audio - - -# Transcribes speech to text using OpenAI's whisper model -def transcribe_audio_whisper(audio: Audio, model: str = "base", device: ty.Optional[str] = None): - """ - Transcribes audio to text using OpenAI's whisper model. +import torch +import whisperx - Args: - audio (Audio). Audio object. - model (str): Model to use for transcription. Defaults to "base". - See https://github.com/openai/whisper/ for a list of all available models. - - Returns: - Result of the transcription. - """ - import whisper - - model = whisper.load_model( - model, - device=device, - ) - result = model.transcribe(audio.signal.squeeze()) - return result +from .process import Audio # Transcribes speech to text using the whisperX model @@ -33,8 +13,9 @@ def transcribe_audio_whisperx( model: str = "base", device: ty.Optional[str] = None, batch_size: int = 16, - compute_type: str = "float32", + compute_type: ty.Optional[str] = None, force_alignment: bool = True, + return_char_alignments: bool = False, diarize: bool = False, min_speakers: ty.Optional[int] = None, max_speakers: ty.Optional[int] = None, @@ -58,12 +39,13 @@ def transcribe_audio_whisperx( Returns: Result of the transcription. """ - import whisperx # 1. 
Transcribe with original whisper (batched) + device = device or "cuda" if torch.cuda.is_available() else "cpu" model = whisperx.load_model(model, device, compute_type=compute_type) - # audio = whisperx.load_audio(audio.signal.squeeze()) + if audio.sample_rate != 16000: + audio = audio.to_16khz() audio = audio.signal.squeeze().numpy() result = model.transcribe(audio, batch_size=batch_size) @@ -73,7 +55,12 @@ def transcribe_audio_whisperx( language_code=result["language"], device=device ) result = whisperx.align( - result["segments"], model_a, metadata, audio, device, return_char_alignments=False + result["segments"], + model_a, + metadata, + audio, + device, + return_char_alignments=return_char_alignments, ) if diarize: From 0c4671a265b47c9b597cf13ae3544c87eca9765e Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Wed, 27 Mar 2024 22:52:43 -0400 Subject: [PATCH 8/9] tst: add whisperx test --- src/tests/test_speech_to_text.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/tests/test_speech_to_text.py b/src/tests/test_speech_to_text.py index 27326a4..1c6e0e4 100644 --- a/src/tests/test_speech_to_text.py +++ b/src/tests/test_speech_to_text.py @@ -3,6 +3,7 @@ import pytest from b2aiprep.process import Audio, SpeechToText +from b2aiprep.speech2text import transcribe_audio_whisperx def test_transcribe(): @@ -22,6 +23,21 @@ def test_transcribe(): assert text.strip() == audio_content +def test_transcribe_whisperx(): + """ + Validates SpeechToText's ability to convert audio to text accurately. + Checks if the transcription matches the expected output, considering known model discrepancies. + """ + audio_path = str((Path(__file__).parent.parent.parent / "data/vc_source.wav").absolute()) + audio_content = "If it isn't, it isn't." + + # Note: Should be "If it didn't, it didn't.", but that's what the model understands + audio = Audio.from_file(audio_path) + + result = transcribe_audio_whisperx(audio, model="tiny", device="cpu", compute_type="float32") + assert result["segments"][0]["text"].strip() == audio_content + + def test_cuda_not_available(): """ Test behavior when CUDA is not available. From 4a31e7706a85209993a6838155532862f978d375 Mon Sep 17 00:00:00 2001 From: Satrajit Ghosh Date: Wed, 27 Mar 2024 23:16:55 -0400 Subject: [PATCH 9/9] ref: restrict python for TTS --- pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ef6d241..d7a54d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,14 +19,14 @@ classifiers = [ "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ] -requires-python = ">=3.10" +requires-python = ">=3.10, <3.12" dependencies = [ "speechbrain>=1.0.0", "torchaudio>=2.0.0", "opensmile>=2.3.0", "matplotlib>=3.8.3", "click", - "whisperx @ git+https://github.com/m-bain/whisperx.git@78dcfaab51005aa703ee21375f81ed31bc248560", + "whisperx @ git+https://github.com/m-bain/whisperx.git@f2da2f858e99e4211fe4f64b5f2938b007827e17", "pydra~=0.23", "TTS", "accelerate", @@ -36,6 +36,7 @@ dependencies = [ [project.optional-dependencies] dev = [ "pytest", + "pre-commit" ] [project.scripts]
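
For completeness, a minimal end-to-end sketch of the final interface, mirroring the call made in src/tests/test_speech_to_text.py (the audio file and the "tiny" model come from that test; the commented-out diarization call additionally assumes a valid Hugging Face auth token):

    from b2aiprep.process import Audio
    from b2aiprep.speech2text import transcribe_audio_whisperx

    # Load the recording; the function resamples to 16 kHz internally if needed.
    audio = Audio.from_file("data/vc_source.wav")

    # CPU-friendly settings, as exercised by the test suite.
    result = transcribe_audio_whisperx(audio, model="tiny", device="cpu", compute_type="float32")
    print(result["segments"][0]["text"])

    # With a GPU and a Hugging Face token, speaker labels can be requested as well:
    # result = transcribe_audio_whisperx(
    #     audio, hf_token="<token>", diarize=True, min_speakers=1, max_speakers=2
    # )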