Add torchcrepe #87

Closed · wants to merge 8 commits

8 changes: 7 additions & 1 deletion .gitignore
@@ -1,9 +1,15 @@
.env
.vscode/
# virtual environment
venv
.venv

# python
**__pycache__

# test data
test_input
test_output

# default output folder
output
output
25 changes: 16 additions & 9 deletions README.md
@@ -112,22 +112,29 @@ _Not all options working now!_

[transcription]
# Default is whisper
--whisper Multilingual model > tiny|base|small|medium|large-v1|large-v2 >> ((default) is large-v2
English-only model > tiny.en|base.en|small.en|medium.en
--align_model Use other languages model for Whisper provided from huggingface.co

--vosk Needs model
--whisper Multilingual model > tiny|base|small|medium|large-v1|large-v2 >> ((default) is large-v2
English-only model > tiny.en|base.en|small.en|medium.en
--whisper_align_model Use other languages model for Whisper provided from huggingface.co
--language Override the language detected by whisper, does not affect transcription but steps after transcription
--whisper_batch_size Reduce if low on GPU mem >> ((default) is 16)
--whisper_compute_type Change to "int8" if low on GPU mem (may reduce accuracy) >> ((default) is "float16" for cuda devices, "int8" for cpu)
--vosk Needs model

[pitcher]
# Default is crepe
--crepe tiny|small|medium|large|full >> ((default) is full)
--crepe tiny|full >> ((default) is full)
--crepe_step_size unit is milliseconds >> ((default) is 10)
--crepe_batch_size Reduce if low on GPU mem >> ((default) is 2048)

[extra]
--hyphenation True|False >> ((default) is True)
--disable_separation True|False >> ((default) is False)
--disable_karaoke True|False >> ((default) is False)
--create_audio_chunks True|False >> ((default) is False)
--plot True|False >> ((default) is False)
--force_whisper_cpu True|False >> ((default) is False)
--force_separation_cpu True|False >> ((default) is False)
--force_crepe_cpu True|False >> ((default) is False)
```

For standard use, you only need to use [opt]. All other options are optional.
@@ -205,7 +212,7 @@ starts at the place or is heard.

### Pitcher

Pitching is done with the `crepe` model.
Pitching is done with the `crepe` model.
Also consider that a bigger model is more accurate, but also takes longer to pitch.
For just testing you should use `tiny`.
If you want solid accuracy, then use the `full` model.
@@ -254,7 +261,7 @@ When you want to use `conda` instead you need a different installation command.

#### Info

If somthing crash because of low VRAM than use a smaller model.
If something crashes because of low VRAM, then use a smaller model.
Whisper needs more than 8GB VRAM in the `large` model!

But you can force cpu usage with the extra options `--force_whisper_cpu` and `--force_separation_cpu`.
But you can force cpu usage with the extra options `--force_whisper_cpu`, `--force_separation_cpu` and `--force_crepe_cpu`.
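
For orientation, a hypothetical invocation that exercises the flags added in this PR might look like this (paths and values are illustrative, not taken from the project docs):

```
python UltraSinger.py -i "input/song.mp3" -o "output" --crepe tiny --crepe_step_size 10 --crepe_batch_size 512 --force_crepe_cpu True
```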
Empty file added conftest.py
Empty file.
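(An empty `conftest.py` at the repository root is a standard pytest idiom: pytest adds the directory containing a `conftest.py` to `sys.path`, which is what lets the new tests import `src.modules...` from the project root.)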
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -1,3 +1,5 @@
[tool.isort]
profile = "black"
[tool.pytest.ini_options]
pythonpath = [
"src",
26 changes: 26 additions & 0 deletions pytest/modules/Audio/test_denoise.py
@@ -0,0 +1,26 @@
"""Tests for whisper.py"""

import os
import unittest
from src.modules.Audio.denoise import ffmpeg_reduce_noise
import pytest


class DenoiseTest(unittest.TestCase):
@pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests")
def test_ffmpeg_reduce_noise(self):
# Arrange
test_dir = os.path.dirname(os.path.abspath(__file__))
root_dir = os.path.abspath(test_dir + "/../../..")
test_file_abs_path = os.path.abspath(root_dir + "/test_input/vocals.wav")
test_file_name = os.path.basename(test_file_abs_path)
test_output = test_dir + "/test_output"

# Act
ffmpeg_reduce_noise(test_file_abs_path, test_output + "/output_" + test_file_name)

# Assert


if __name__ == "__main__":
unittest.main()
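
`ffmpeg_reduce_noise` itself is not shown in this diff. As a rough sketch of what such a helper can look like using the `ffmpeg_python` package from requirements.txt, assuming FFmpeg's `afftdn` denoise filter (the real module may use different filters or options):

```python
import ffmpeg  # provided by the ffmpeg_python package


def ffmpeg_reduce_noise(input_path: str, output_path: str) -> None:
    """Reduce noise in an audio file with FFmpeg's FFT denoiser (sketch)."""
    (
        ffmpeg.input(input_path)
        .output(output_path, af="afftdn")  # afftdn: adaptive FFT-based denoising
        .overwrite_output()
        .run(quiet=True)
    )
```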
26 changes: 26 additions & 0 deletions pytest/modules/Pitcher/test_pitcher.py
@@ -0,0 +1,26 @@
"""Tests for pitcher.py"""

import os
import unittest
import src.modules.Pitcher.pitcher as test_subject
import pytest


class PitcherTest(unittest.TestCase):
@pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests")
def test_get_pitch_with_crepe_file(self):
# Arrange
test_dir = os.path.dirname(os.path.abspath(__file__))
root_dir = os.path.abspath(test_dir + "/../../..")
test_file_abs_path = os.path.abspath(root_dir + "/test_input/audio_denoised.wav")
test_file_name = os.path.basename(test_file_abs_path)
test_output = test_dir + "/test_output"

# Act
test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', 'cuda', batch_size=1024)
# test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', 'cpu', batch_size=1024)

print("done")

if __name__ == "__main__":
unittest.main()
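
The test above calls the new torchcrepe-backed pitcher as `get_pitch_with_crepe_file(file, model, device, batch_size=...)`. A minimal sketch of what that wrapper plausibly does with the torchcrepe API (the actual implementation lives in `src/modules/Pitcher/pitcher.py` and may differ in detail):

```python
import torchcrepe


def get_pitch_with_crepe_file(wav_file: str, model: str = "full", device: str = "cpu",
                              batch_size: int = 2048, step_size: int = 10):
    """Sketch: run torchcrepe over a wav file, returning pitch and confidence."""
    audio, sample_rate = torchcrepe.load.audio(wav_file)
    hop_length = int(sample_rate * step_size / 1000)  # step size is given in milliseconds
    frequencies, confidence = torchcrepe.predict(
        audio,
        sample_rate,
        hop_length=hop_length,
        model=model,              # torchcrepe ships only "tiny" and "full"
        batch_size=batch_size,
        device=device,
        return_periodicity=True,  # periodicity serves as a per-frame confidence
    )
    return frequencies, confidence
```

This is also why the README's pitcher options change from `tiny|small|medium|large|full` to `tiny|full`: unlike the TensorFlow `crepe` package, torchcrepe provides only those two model capacities.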
35 changes: 17 additions & 18 deletions requirements.txt
@@ -1,25 +1,24 @@
crepe==0.0.13
librosa==0.9.2
demucs~=4.0.0
ffmpeg_python~=0.2.0
git+https://github.com/m-bain/whisperx.git
langcodes~=3.3.0
language-data~=1.1
librosa~=0.9.1
matplotlib~=3.7.1
musicbrainzngs~=0.7.1
numpy~=1.23.5
Pillow~=10.0.0
pretty_midi~=0.2.10
pydub==0.25.1
Levenshtein~=0.21.0
scipy~=1.10.1
PyHyphen~=4.0.3
python_Levenshtein~=0.21.1
torchcrepe==0.0.21
tqdm==4.65.0
vosk==0.3.44
tensorflow~=2.12.0
#tensorflow-directml-plugin~=0.4.0.dev230202
#cudatoolkit==11.8.0
matplotlib~=3.7.1
whisperx~=3.1.1
yt_dlp~=2023.7.6
numpy~=1.23.5
git+https://github.com/m-bain/whisperx.git
PyHyphen~=4.0.3
demucs~=4.0.0
tqdm~=4.65.0
langcodes~=3.3.0
language-data~=1.1
Pillow~=9.5.0

isort==5.12
black==23.3
pylint==2.17
pytest~=7.3.1
musicbrainzngs~=0.7.1
pytest~=7.3.1
10 changes: 8 additions & 2 deletions src/Settings.py
@@ -10,6 +10,8 @@ class Settings:
output_file_path = ""
mono_audio_path = ""

language = None

# Transcribe
audio_chunk_folder_name = "audio-chunks"

@@ -18,15 +20,19 @@ class Settings:
whisper_model = "large-v2" # Multilingual model tiny|base|small|medium|large-v1|large-v2
# English-only model tiny.en|base.en|small.en|medium.en
whisper_align_model = None # Model for other languages from huggingface.co e.g -> "gigant/romanian-wav2vec2"
whisper_batch_size = 16 # reduce if low on GPU mem
whisper_compute_type = None # change to "int8" if low on GPU mem (may reduce accuracy)

# Vosk
vosk_model_path = "" # "models/vosk-model-small-en-us-0.15"

# Pitch
crepe_model_capacity = "full" # tiny|small|medium|large|full
crepe_step_size = 10
crepe_model_capacity = "full" # tiny|full
crepe_step_size = 10 # in milliseconds
crepe_batch_size = None # torchcrepe defaults to 2048, reduce if low on GPU mem

# Device
device = 'cpu' # cpu|cuda
force_whisper_cpu = False
force_separation_cpu = False
force_crepe_cpu = False
77 changes: 54 additions & 23 deletions src/UltraSinger.py
@@ -4,7 +4,6 @@
import getopt
import os
import sys
from time import sleep

import Levenshtein
import librosa
@@ -17,7 +16,7 @@
export_chunks_from_transcribed_data,
export_chunks_from_ultrastar_data,
)
from modules.Audio.silence_processing import remove_silence_from_transcribtion_data
from modules.Audio.silence_processing import remove_silence_from_transcription_data
from modules.csv_handler import export_transcribed_data_to_csv
from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_wav_to_mp3
from modules.Audio.youtube import (
@@ -83,6 +82,7 @@ def pitch_each_chunk_with_crepe(directory: str) -> list[str]:
f"{ULTRASINGER_HEAD} Pitching each chunk with {blue_highlighted('crepe')}"
)

device = "cpu" if settings.force_crepe_cpu else settings.device
midi_notes = []
for filename in sorted(
[f for f in os.listdir(directory) if f.endswith(".wav")],
@@ -91,7 +91,11 @@
filepath = os.path.join(directory, filename)
# todo: stepsize = duration? then when shorter than "it" it should take the duration. Otherwise there a more notes
pitched_data = get_pitch_with_crepe_file(
filepath, settings.crepe_step_size, settings.crepe_model_capacity
filepath,
settings.crepe_model_capacity,
device,
settings.crepe_batch_size,
settings.crepe_step_size,
)
conf_f = get_frequency_with_high_confidence(
pitched_data.frequencies, pitched_data.confidence
@@ -230,7 +234,6 @@ def hyphenate_each_word(language: str, transcribed_data: list[TranscribedData])
)
return None

sleep(0.1)
hyphenator = create_hyphenator(lang_region)
for i in tqdm(enumerate(transcribed_data)):
pos = i[0]
@@ -309,11 +312,14 @@ def run() -> None:

# Audio transcription
transcribed_data = None
language = None
language = settings.language
if is_audio:
language, transcribed_data = transcribe_audio()
detected_language, transcribed_data = transcribe_audio()
if language is None:
language = detected_language

remove_unecessary_punctuations(transcribed_data)
transcribed_data = remove_silence_from_transcribtion_data(
transcribed_data = remove_silence_from_transcription_data(
settings.mono_audio_path, transcribed_data
)

@@ -409,15 +415,14 @@ def transcribe_audio() -> (str, list[TranscribedData]):
"""Transcribe audio with AI"""
if settings.transcriber == "whisper":
device = "cpu" if settings.force_whisper_cpu else settings.device
transcribed_data, language = transcribe_with_whisper(
settings.mono_audio_path, settings.whisper_model, device, settings.whisper_align_model)
transcribed_data, detected_language = transcribe_with_whisper(
settings.mono_audio_path, settings.whisper_model, device, settings.whisper_align_model, settings.whisper_batch_size, settings.whisper_compute_type, settings.language)
else: # vosk
transcribed_data = transcribe_with_vosk(
settings.mono_audio_path, settings.vosk_model_path
)
# todo: make language selectable
language = "en"
return language, transcribed_data
detected_language = "en"
return detected_language, transcribed_data


def separate_vocal_from_audio(
@@ -427,16 +432,18 @@
audio_separation_path = os.path.join(
cache_path, "separated", "htdemucs", basename_without_ext
)
device = "cpu" if settings.force_separation_cpu else settings.device

if settings.use_separated_vocal or settings.create_karaoke:
device = "cpu" if settings.force_separation_cpu else settings.device
separate_audio(ultrastar_audio_input_path, cache_path, device)

if settings.use_separated_vocal:
vocals_path = os.path.join(audio_separation_path, "vocals.wav")
convert_audio_to_mono_wav(vocals_path, settings.mono_audio_path)
input_path = os.path.join(audio_separation_path, "vocals.wav")
else:
convert_audio_to_mono_wav(
ultrastar_audio_input_path, settings.mono_audio_path
)
input_path = ultrastar_audio_input_path


convert_audio_to_mono_wav(input_path, settings.mono_audio_path)
return audio_separation_path


@@ -695,10 +702,13 @@ def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrast
"""Pitch audio"""
# todo: chunk pitching as option?
# midi_notes = pitch_each_chunk_with_crepe(chunk_folder_name)
device = "cpu" if settings.force_crepe_cpu else settings.device
pitched_data = get_pitch_with_crepe_file(
settings.mono_audio_path,
settings.crepe_step_size,
settings.crepe_model_capacity,
device,
settings.crepe_batch_size,
settings.crepe_step_size,
)
if is_audio:
start_times = []
@@ -777,15 +787,27 @@ def init_settings(argv: list[str]) -> None:
elif opt in ("--whisper"):
settings.transcriber = "whisper"
settings.whisper_model = arg
elif opt in ("--align_model"):
elif opt in ("--whisper_align_model"):
settings.whisper_align_model = arg
elif opt in ("--whisper_batch_size"):
settings.whisper_batch_size = int(arg)
elif opt in ("--whisper_compute_type"):
settings.whisper_compute_type = arg
elif opt in ("--language"):
settings.language = arg
elif opt in ("--vosk"):
settings.transcriber = "vosk"
settings.vosk_model_path = arg
elif opt in ("--crepe"):
settings.crepe_model_capacity = arg
elif opt in ("--crepe_step_size"):
settings.crepe_step_size = int(arg)
elif opt in ("--crepe_batch_size"):
settings.crepe_batch_size = int(arg)
elif opt in ("--plot"):
settings.create_plot = arg
settings.create_plot = arg in ["True", "true"]
elif opt in ("--midi"):
settings.create_midi = arg in ["True", "true"]
elif opt in ("--hyphenation"):
settings.hyphenation = arg
elif opt in ("--disable_separation"):
@@ -798,6 +820,8 @@
settings.force_whisper_cpu = arg
elif opt in ("--force_separation_cpu"):
settings.force_separation_cpu = arg
elif opt in ("--force_crepe_cpu"):
settings.force_crepe_cpu = arg
if settings.output_file_path == "":
if settings.input_file_path.startswith("https:"):
dirname = os.getcwd()
@@ -813,16 +837,23 @@ def arg_options():
"ifile=",
"ofile=",
"crepe=",
"crepe_step_size=",
"crepe_batch_size=",
"vosk=",
"whisper=",
"align_model=",
"whisper_align_model=",
"whisper_batch_size=",
"whisper_compute_type=",
"language=",
"plot=",
"midi=",
"hyphenation=",
"disable_separation=",
"disable_karaoke=",
"create_audio_chunks=",
"force_whisper_cpu=",
"force_separation_cpu="
"force_separation_cpu=",
"force_crepe_cpu=",
]
return long, short
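
One observation on `init_settings`: `--plot` and `--midi` are converted to real booleans via `arg in ["True", "true"]`, while `--hyphenation`, `--disable_separation` and the `--force_*_cpu` flags store the raw argument string, which is truthy for any non-empty value (including "False"). A small helper would make the parsing uniform (a hypothetical suggestion, not part of this PR):

```python
def parse_bool_arg(arg: str) -> bool:
    """Interpret CLI strings such as "True"/"false" as real booleans."""
    return arg.strip().lower() in ("true", "1", "yes")

# e.g. settings.force_crepe_cpu = parse_bool_arg(arg)
```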
