rakuri255 · louispan · Apr 6, 2024 · Jun 2, 2024 · Jun 2, 2024 · rakuri255
diff --git a/pytest/modules/Speech_Recognition/test_Whisper.py b/pytest/modules/Speech_Recognition/test_Whisper.py
@@ -2,7 +2,7 @@
 
 import unittest
 from src.modules.Speech_Recognition.TranscribedData import TranscribedData
-from src.modules.Speech_Recognition.Whisper import convert_to_transcribed_data
+from src.modules.Speech_Recognition.Whisper import convert_to_transcribed_data, any_number_to_words
 
 
 class ConvertToTranscribedDataTest(unittest.TestCase):
@@ -51,6 +51,20 @@ def test_convert_to_transcribed_data(self):
             self.assertEqual(transcribed_data[i].start, expected_output[i].start)
             self.assertEqual(transcribed_data[i].is_hyphen, expected_output[i].is_hyphen)
 
+    def test_any_number_to_words_converts(self):
+        """Digits are spelled out while the surrounding words, punctuation and spacing are preserved."""
+        for text, expected in (("I have 1 million dollars and 2 cents.", "I have one million dollars and two cents."),
+                               ("1 2 3 4 5", "one two three four five"), ("1, 2, 3, 4, 5,", "one, two, three, four, five,"),
+                               ("Hello world 1, 2!. 3. 4? Test 100#", "Hello world one, two!. three. four? Test one hundred#")):
+            self.act_and_assert(text, expected)
+
+    def act_and_assert(self, text, expected_output):
+        """Convert *text* with any_number_to_words and compare it against the expectation.
+
+        Fails the running test when the converted text differs from *expected_output*.
+        """
+        self.assertEqual(any_number_to_words(text), expected_output)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/requirements.txt b/requirements.txt
@@ -2,6 +2,7 @@ crepe~=0.0.13
 demucs~=4.0.0
 ffmpeg_python~=0.2.0
 git+https://github.com/m-bain/whisperx.git
+inflect~=7.2.0
 langcodes~=3.3.0
 language-data~=1.1
 librosa~=0.9.2
@@ -23,4 +24,4 @@ black~=23.3
 pylint~=2.17
 pytest~=7.3.1
 protobuf==3.20.*
-packaging~=23.2
+packaging~=23.2
diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py
@@ -8,6 +8,26 @@
 from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted, red_highlighted
 from modules.Speech_Recognition.TranscribedData import TranscribedData
 
+import re
+import ast
+import inflect
+
+re_split_preserve_space = re.compile(r'(\d+|\W+|\w+)')  # tokenizer: tries digit runs first, then non-word runs, then word runs
+inflect_engine = inflect.engine()  # module-level inflect engine; any_number_to_words calls its number_to_words
+
+def any_number_to_words(line):
+    """Return *line* with each run of ASCII digits spelled out as English words.
+
+    whisperX (https://github.com/m-bain/whisperX) gives no timing to words such as "2014." or "£13.60"
+    because their characters are missing from the alignment model's dictionary; other tokens are kept."""
+    out_tokens = []
+    for token in re_split_preserve_space.findall(line):
+        try:  # int() on digit-only tokens: converts "007" and never mistakes "True" for a number
+            number = token.isascii() and token.isdigit()
+            out_tokens.append(inflect_engine.number_to_words(int(token)) if number else token)
+        except Exception:  # best effort: a number inflect cannot spell is kept as written
+            out_tokens.append(token)
+    return ''.join(out_tokens)
 
 def transcribe_with_whisper(
     audio_path: str,
@@ -80,6 +100,10 @@ def transcribe_with_whisper(
         )
         sys.exit(1)
 
+    # convert any numbers to words so align will have timing
+    for obj in result["segments"]:
+        obj['text'] = any_number_to_words(obj['text'])
+
     # align whisper output
     result_aligned = whisperx.align(
         result["segments"],
@@ -102,14 +126,20 @@ def convert_to_transcribed_data(result_aligned):
             vtd = TranscribedData(obj)  # create custom Word object
             vtd.word = vtd.word + " "  # add space to end of word
             if len(obj) < 4:
-                previous = transcribed_data[-1]
-                if not previous:
-                    previous.end = 0
-                    previous.end = ""
-                vtd.start = previous.end + 0.1
-                vtd.end = previous.end + 0.2
-                msg = f'Error: There is no timestamp for word: "{obj["word"]}". ' \
-                      f'Fixing it by placing it after the previous word: "{previous.word}". At start: {vtd.start} end: {vtd.end}. Fix it manually!'
+                if len(transcribed_data) == 0: # if the first word doesn't have any timing data
+                    vtd.start = 0.0
+                    vtd.end = 0.1
+                    msg = f'Error: There is no timestamp for word: "{obj["word"]}". ' \
+                        f'Fixing it by placing it at beginning. At start: {vtd.start} end: {vtd.end}. Fix it manually!'
+                else:
+                    previous = transcribed_data[-1]
+                    if not previous:
+                        previous.end = 0
+                        previous.end = ""
+                    vtd.start = previous.end + 0.1
+                    vtd.end = previous.end + 0.2
+                    msg = f'Error: There is no timestamp for word: "{obj["word"]}". ' \
+                        f'Fixing it by placing it after the previous word: "{previous.word}". At start: {vtd.start} end: {vtd.end}. Fix it manually!'
                 print(f"{red_highlighted(msg)}")
             transcribed_data.append(vtd)  # and add it to list
     return transcribed_data