Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert numbers to words to obtain timing data from load_align_model() #135

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion pytest/modules/Speech_Recognition/test_Whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import unittest
from src.modules.Speech_Recognition.TranscribedData import TranscribedData
from src.modules.Speech_Recognition.Whisper import convert_to_transcribed_data
from src.modules.Speech_Recognition.Whisper import convert_to_transcribed_data, any_number_to_words


class ConvertToTranscribedDataTest(unittest.TestCase):
Expand Down Expand Up @@ -51,6 +51,20 @@ def test_convert_to_transcribed_data(self):
self.assertEqual(transcribed_data[i].start, expected_output[i].start)
self.assertEqual(transcribed_data[i].is_hyphen, expected_output[i].is_hyphen)

def test_any_number_to_words_converts(self):
    # Each pair is (input text, expected text with the digits spelled out).
    # Spacing and punctuation around the numbers must come through unchanged.
    cases = (
        ("I have 1 million dollars and 2 cents.", "I have one million dollars and two cents."),
        ("1 2 3 4 5", "one two three four five"),
        ("1, 2, 3, 4, 5,", "one, two, three, four, five,"),
        ("Hello world 1, 2!. 3. 4? Test 100#", "Hello world one, two!. three. four? Test one hundred#"),
    )
    for raw_text, spelled_out in cases:
        self.act_and_assert(raw_text, spelled_out)

def act_and_assert(self, text, expected_output):
    # Shared helper: convert `text` and compare the result with `expected_output`.
    self.assertEqual(any_number_to_words(text), expected_output)


# Run the unittest test runner when this module is executed directly as a script.
if __name__ == "__main__":
unittest.main()
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ crepe~=0.0.13
demucs~=4.0.0
ffmpeg_python~=0.2.0
git+https://github.com/m-bain/whisperx.git
inflect~=7.2.0
langcodes~=3.3.0
language-data~=1.1
librosa~=0.9.2
Expand All @@ -23,4 +24,4 @@ black~=23.3
pylint~=2.17
pytest~=7.3.1
protobuf==3.20.*
packaging~=23.2
packaging~=23.2
46 changes: 38 additions & 8 deletions src/modules/Speech_Recognition/Whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,26 @@
from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted, red_highlighted
from modules.Speech_Recognition.TranscribedData import TranscribedData

import re
import ast
import inflect

# Tokenizer used by any_number_to_words(): splits text into runs of digits,
# runs of non-word characters (spaces, punctuation) and runs of word characters.
# Every character falls into one of these runs, so joining the tokens gives back
# the original text. A digit run is only split out where a token starts with
# digits: "123abc" yields "123" and "abc", while "abc123" stays one token.
re_split_preserve_space = re.compile(r'(\d+|\W+|\w+)')
# Single inflect engine, created once at import time and used by any_number_to_words().
inflect_engine = inflect.engine()

def any_number_to_words(line):
    """Spell out every digit token in ``line`` as words.

    whisperX (https://github.com/m-bain/whisperX) cannot align transcript
    words that contain characters missing from the alignment model's
    dictionary, e.g. "2014." or "£13.60", so such words are not given a
    timing. Converting the digits to words first lets them be aligned.

    The number words come from the module-level inflect engine, so they are
    English regardless of the language of ``line``.

    Args:
        line: Text to convert.

    Returns:
        ``line`` with each run of ASCII digits that starts a token replaced by
        its spelled-out form. All other characters (whitespace, punctuation,
        words) are preserved exactly.
    """
    out_tokens = []
    for token in re_split_preserve_space.findall(line):
        # Test for digits directly instead of evaluating the token as a Python
        # literal: tokens such as "True", "()" or "..." are valid literals but
        # not numbers, and must pass through untouched. int() also accepts
        # leading zeros ("007" -> 7).
        if token.isascii() and token.isdigit():
            try:
                token = inflect_engine.number_to_words(int(token))
            except Exception:
                # Best effort: if the value cannot be converted (e.g. it is too
                # large), keep the original digits rather than abort the
                # transcription.
                pass
        out_tokens.append(token)
    return ''.join(out_tokens)

def transcribe_with_whisper(
audio_path: str,
Expand Down Expand Up @@ -80,6 +100,10 @@ def transcribe_with_whisper(
)
sys.exit(1)

# convert any numbers to words so align will have timing
for obj in result["segments"]:
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This only works for English songs. For other languages, the numbers would still be converted to English words.

obj['text'] = any_number_to_words(obj['text'])

# align whisper output
result_aligned = whisperx.align(
result["segments"],
Expand All @@ -102,14 +126,20 @@ def convert_to_transcribed_data(result_aligned):
vtd = TranscribedData(obj) # create custom Word object
vtd.word = vtd.word + " " # add space to end of word
if len(obj) < 4:
previous = transcribed_data[-1]
if not previous:
previous.end = 0
previous.end = ""
vtd.start = previous.end + 0.1
vtd.end = previous.end + 0.2
msg = f'Error: There is no timestamp for word: "{obj["word"]}". ' \
f'Fixing it by placing it after the previous word: "{previous.word}". At start: {vtd.start} end: {vtd.end}. Fix it manually!'
if len(transcribed_data) == 0: # if the first word doesn't have any timing data
vtd.start = 0.0
vtd.end = 0.1
msg = f'Error: There is no timestamp for word: "{obj["word"]}". ' \
f'Fixing it by placing it at beginning. At start: {vtd.start} end: {vtd.end}. Fix it manually!'
else:
previous = transcribed_data[-1]
if not previous:
previous.end = 0
previous.end = ""
vtd.start = previous.end + 0.1
vtd.end = previous.end + 0.2
msg = f'Error: There is no timestamp for word: "{obj["word"]}". ' \
f'Fixing it by placing it after the previous word: "{previous.word}". At start: {vtd.start} end: {vtd.end}. Fix it manually!'
print(f"{red_highlighted(msg)}")
transcribed_data.append(vtd) # and add it to list
return transcribed_data
Loading