Add torchcrepe #87

Closed · wants to merge 8 commits

8 changes: 7 additions & 1 deletion .gitignore
@@ -1,9 +1,15 @@
.env
.vscode/
# virtual environment
venv
.venv

# python
**__pycache__

# test data
test_input
test_output

# default output folder
output
output
25 changes: 16 additions & 9 deletions README.md
@@ -112,22 +112,29 @@ _Not all options working now!_

[transcription]
# Default is whisper
--whisper Multilingual model > tiny|base|small|medium|large-v1|large-v2 >> ((default) is large-v2
English-only model > tiny.en|base.en|small.en|medium.en
--align_model Use other languages model for Whisper provided from huggingface.co

--vosk Needs model
--whisper Multilingual model > tiny|base|small|medium|large-v1|large-v2 >> ((default) is large-v2
English-only model > tiny.en|base.en|small.en|medium.en
--whisper_align_model Use other languages model for Whisper provided from huggingface.co
--language Override the language detected by whisper, does not affect transcription but steps after transcription
--whisper_batch_size Reduce if low on GPU mem >> ((default) is 16)
--whisper_compute_type Change to "int8" if low on GPU mem (may reduce accuracy) >> ((default) is "float16" for cuda devices, "int8" for cpu)
--vosk Needs model

[pitcher]
# Default is crepe
--crepe tiny|small|medium|large|full >> ((default) is full)
--crepe tiny|full >> ((default) is full)
--crepe_step_size unit is milliseconds >> ((default) is 10)
--crepe_batch_size Reduce if low on GPU mem >> ((default) is 2048)

[extra]
--hyphenation True|False >> ((default) is True)
--disable_separation True|False >> ((default) is False)
--disable_karaoke True|False >> ((default) is False)
--create_audio_chunks True|False >> ((default) is False)
--plot True|False >> ((default) is False)
--force_whisper_cpu True|False >> ((default) is False)
--force_separation_cpu True|False >> ((default) is False)
--force_crepe_cpu True|False >> ((default) is False)
```

For standard use, you only need to use [opt]. All other options are optional.
@@ -205,7 +212,7 @@ starts at the place or is heard.

### Pitcher

Pitching is done with the `crepe` model.
Pitching is done with the `crepe` model.
Also consider that a bigger model is more accurate, but also takes longer to pitch.
For just testing you should use `tiny`.
If you want solid accuracy, then use the `full` model.
@@ -254,7 +261,7 @@ When you want to use `conda` instead you need a different installation command.

#### Info

If somthing crash because of low VRAM than use a smaller model.
If something crashes because of low VRAM, then use a smaller model.
Whisper needs more than 8GB VRAM in the `large` model!

But you can force cpu usage with the extra options `--force_whisper_cpu` and `--force_separation_cpu`.
But you can force cpu usage with the extra options `--force_whisper_cpu`, `--force_separation_cpu` and `--force_crepe_cpu`.
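
For orientation, a hypothetical invocation that exercises the flags added in this PR might look like this (paths and values are illustrative, not taken from the project docs):

```
python UltraSinger.py -i "input/song.mp3" -o "output" --crepe tiny --crepe_step_size 10 --crepe_batch_size 512 --force_crepe_cpu True
```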
Empty file added conftest.py
Empty file.
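(An empty `conftest.py` at the repository root is a standard pytest idiom: pytest adds the directory containing a `conftest.py` to `sys.path`, which is what lets the new tests import `src.modules...` from the project root.)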
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -1,3 +1,5 @@
[tool.isort]
profile = "black"
[tool.pytest.ini_options]
pythonpath = [
"src",
26 changes: 26 additions & 0 deletions pytest/modules/Audio/test_denoise.py
@@ -0,0 +1,26 @@
"""Tests for whisper.py"""

import os
import unittest
from src.modules.Audio.denoise import ffmpeg_reduce_noise
import pytest


class DenoiseTest(unittest.TestCase):
@pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests")
def test_ffmpeg_reduce_noise(self):
# Arrange
test_dir = os.path.dirname(os.path.abspath(__file__))
root_dir = os.path.abspath(test_dir + "/../../..")
test_file_abs_path = os.path.abspath(root_dir + "/test_input/vocals.wav")
test_file_name = os.path.basename(test_file_abs_path)
test_output = test_dir + "/test_output"

# Act
ffmpeg_reduce_noise(test_file_abs_path, test_output + "/output_" + test_file_name)

# Assert


if __name__ == "__main__":
unittest.main()
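
`ffmpeg_reduce_noise` itself is not shown in this diff. As a rough sketch of what such a helper can look like using the `ffmpeg_python` package from requirements.txt, assuming FFmpeg's `afftdn` denoise filter (the real module may use different filters or options):

```python
import ffmpeg  # provided by the ffmpeg_python package


def ffmpeg_reduce_noise(input_path: str, output_path: str) -> None:
    """Reduce noise in an audio file with FFmpeg's FFT denoiser (sketch)."""
    (
        ffmpeg.input(input_path)
        .output(output_path, af="afftdn")  # afftdn: adaptive FFT-based denoising
        .overwrite_output()
        .run(quiet=True)
    )
```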
26 changes: 26 additions & 0 deletions pytest/modules/Pitcher/test_pitcher.py
@@ -0,0 +1,26 @@
"""Tests for pitcher.py"""

import os
import unittest
import src.modules.Pitcher.pitcher as test_subject
import pytest


class PitcherTest(unittest.TestCase):
@pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests")
def test_get_pitch_with_crepe_file(self):
# Arrange
test_dir = os.path.dirname(os.path.abspath(__file__))
root_dir = os.path.abspath(test_dir + "/../../..")
test_file_abs_path = os.path.abspath(root_dir + "/test_input/audio_denoised.wav")
test_file_name = os.path.basename(test_file_abs_path)
test_output = test_dir + "/test_output"

# Act
test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', 'cuda', batch_size=1024)
# test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', 'cpu', batch_size=1024)

print("done")

if __name__ == "__main__":
unittest.main()
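
The test above calls the new torchcrepe-backed pitcher as `get_pitch_with_crepe_file(file, model, device, batch_size=...)`. A minimal sketch of what that wrapper plausibly does with the torchcrepe API (the actual implementation lives in `src/modules/Pitcher/pitcher.py` and may differ in detail):

```python
import torchcrepe


def get_pitch_with_crepe_file(wav_file: str, model: str = "full", device: str = "cpu",
                              batch_size: int = 2048, step_size: int = 10):
    """Sketch: run torchcrepe over a wav file, returning pitch and confidence."""
    audio, sample_rate = torchcrepe.load.audio(wav_file)
    hop_length = int(sample_rate * step_size / 1000)  # step size is given in milliseconds
    frequencies, confidence = torchcrepe.predict(
        audio,
        sample_rate,
        hop_length=hop_length,
        model=model,              # torchcrepe ships only "tiny" and "full"
        batch_size=batch_size,
        device=device,
        return_periodicity=True,  # periodicity serves as a per-frame confidence
    )
    return frequencies, confidence
```

This is also why the README's pitcher options change from `tiny|small|medium|large|full` to `tiny|full`: unlike the TensorFlow `crepe` package, torchcrepe provides only those two model capacities.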
35 changes: 17 additions & 18 deletions requirements.txt
@@ -1,25 +1,24 @@
crepe==0.0.13
librosa==0.9.2
demucs~=4.0.0
ffmpeg_python~=0.2.0
git+https://github.com/m-bain/whisperx.git
langcodes~=3.3.0
language-data~=1.1
librosa~=0.9.1
matplotlib~=3.7.1
musicbrainzngs~=0.7.1
numpy~=1.23.5
Pillow~=10.0.0
pretty_midi~=0.2.10
pydub==0.25.1
Levenshtein~=0.21.0
scipy~=1.10.1
PyHyphen~=4.0.3
python_Levenshtein~=0.21.1
torchcrepe==0.0.21
tqdm==4.65.0
vosk==0.3.44
tensorflow~=2.12.0
#tensorflow-directml-plugin~=0.4.0.dev230202
#cudatoolkit==11.8.0
matplotlib~=3.7.1
whisperx~=3.1.1
yt_dlp~=2023.7.6
numpy~=1.23.5
git+https://github.com/m-bain/whisperx.git
PyHyphen~=4.0.3
demucs~=4.0.0
tqdm~=4.65.0
langcodes~=3.3.0
language-data~=1.1
Pillow~=9.5.0

isort==5.12
black==23.3
pylint==2.17
pytest~=7.3.1
musicbrainzngs~=0.7.1
pytest~=7.3.1
10 changes: 8 additions & 2 deletions src/Settings.py
@@ -10,6 +10,8 @@ class Settings:
output_file_path = ""
mono_audio_path = ""

language = None

# Transcribe
audio_chunk_folder_name = "audio-chunks"

@@ -18,15 +20,19 @@ class Settings:
whisper_model = "large-v2" # Multilingual model tiny|base|small|medium|large-v1|large-v2
# English-only model tiny.en|base.en|small.en|medium.en
whisper_align_model = None # Model for other languages from huggingface.co e.g -> "gigant/romanian-wav2vec2"
whisper_batch_size = 16 # reduce if low on GPU mem
whisper_compute_type = None # change to "int8" if low on GPU mem (may reduce accuracy)

# Vosk
vosk_model_path = "" # "models/vosk-model-small-en-us-0.15"

# Pitch
crepe_model_capacity = "full" # tiny|small|medium|large|full
crepe_step_size = 10
crepe_model_capacity = "full" # tiny|full
crepe_step_size = 10 # in milliseconds
crepe_batch_size = None # torchcrepe defaults to 2048, reduce if low on GPU mem

# Device
device = 'cpu' # cpu|cuda
force_whisper_cpu = False
force_separation_cpu = False
force_crepe_cpu = False
77 changes: 54 additions & 23 deletions src/UltraSinger.py
@@ -4,7 +4,6 @@
import getopt
import os
import sys
from time import sleep

import Levenshtein
import librosa
@@ -17,7 +16,7 @@
export_chunks_from_transcribed_data,
export_chunks_from_ultrastar_data,
)
from modules.Audio.silence_processing import remove_silence_from_transcribtion_data
from modules.Audio.silence_processing import remove_silence_from_transcription_data
from modules.csv_handler import export_transcribed_data_to_csv
from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_wav_to_mp3
from modules.Audio.youtube import (
@@ -83,6 +82,7 @@ def pitch_each_chunk_with_crepe(directory: str) -> list[str]:
f"{ULTRASINGER_HEAD} Pitching each chunk with {blue_highlighted('crepe')}"
)

device = "cpu" if settings.force_crepe_cpu else settings.device
midi_notes = []
for filename in sorted(
[f for f in os.listdir(directory) if f.endswith(".wav")],
@@ -91,7 +91,11 @@
filepath = os.path.join(directory, filename)
# todo: stepsize = duration? then when shorter than "it" it should take the duration. Otherwise there a more notes
pitched_data = get_pitch_with_crepe_file(
filepath, settings.crepe_step_size, settings.crepe_model_capacity
filepath,
settings.crepe_model_capacity,
device,
settings.crepe_batch_size,
settings.crepe_step_size,
)
conf_f = get_frequency_with_high_confidence(
pitched_data.frequencies, pitched_data.confidence
@@ -230,7 +234,6 @@ def hyphenate_each_word(language: str, transcribed_data: list[TranscribedData])
)
return None

sleep(0.1)
hyphenator = create_hyphenator(lang_region)
for i in tqdm(enumerate(transcribed_data)):
pos = i[0]
@@ -309,11 +312,14 @@ def run() -> None:

# Audio transcription
transcribed_data = None
language = None
language = settings.language
if is_audio:
language, transcribed_data = transcribe_audio()
detected_language, transcribed_data = transcribe_audio()
if language is None:
language = detected_language

remove_unecessary_punctuations(transcribed_data)
transcribed_data = remove_silence_from_transcribtion_data(
transcribed_data = remove_silence_from_transcription_data(
settings.mono_audio_path, transcribed_data
)

@@ -409,15 +415,14 @@ def transcribe_audio() -> (str, list[TranscribedData]):
"""Transcribe audio with AI"""
if settings.transcriber == "whisper":
device = "cpu" if settings.force_whisper_cpu else settings.device
transcribed_data, language = transcribe_with_whisper(
settings.mono_audio_path, settings.whisper_model, device, settings.whisper_align_model)
transcribed_data, detected_language = transcribe_with_whisper(
settings.mono_audio_path, settings.whisper_model, device, settings.whisper_align_model, settings.whisper_batch_size, settings.whisper_compute_type, settings.language)
else: # vosk
transcribed_data = transcribe_with_vosk(
settings.mono_audio_path, settings.vosk_model_path
)
# todo: make language selectable
language = "en"
return language, transcribed_data
detected_language = "en"
return detected_language, transcribed_data


def separate_vocal_from_audio(
@@ -427,16 +432,18 @@
audio_separation_path = os.path.join(
cache_path, "separated", "htdemucs", basename_without_ext
)
device = "cpu" if settings.force_separation_cpu else settings.device

if settings.use_separated_vocal or settings.create_karaoke:
device = "cpu" if settings.force_separation_cpu else settings.device
separate_audio(ultrastar_audio_input_path, cache_path, device)

if settings.use_separated_vocal:
vocals_path = os.path.join(audio_separation_path, "vocals.wav")
convert_audio_to_mono_wav(vocals_path, settings.mono_audio_path)
input_path = os.path.join(audio_separation_path, "vocals.wav")
else:
convert_audio_to_mono_wav(
ultrastar_audio_input_path, settings.mono_audio_path
)
input_path = ultrastar_audio_input_path


convert_audio_to_mono_wav(input_path, settings.mono_audio_path)
return audio_separation_path


@@ -695,10 +702,13 @@ def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrast
"""Pitch audio"""
# todo: chunk pitching as option?
# midi_notes = pitch_each_chunk_with_crepe(chunk_folder_name)
device = "cpu" if settings.force_crepe_cpu else settings.device
pitched_data = get_pitch_with_crepe_file(
settings.mono_audio_path,
settings.crepe_step_size,
settings.crepe_model_capacity,
device,
settings.crepe_batch_size,
settings.crepe_step_size,
)
if is_audio:
start_times = []
@@ -777,15 +787,27 @@ def init_settings(argv: list[str]) -> None:
elif opt in ("--whisper"):
settings.transcriber = "whisper"
settings.whisper_model = arg
elif opt in ("--align_model"):
elif opt in ("--whisper_align_model"):
settings.whisper_align_model = arg
elif opt in ("--whisper_batch_size"):
settings.whisper_batch_size = int(arg)
elif opt in ("--whisper_compute_type"):
settings.whisper_compute_type = arg
elif opt in ("--language"):
settings.language = arg
elif opt in ("--vosk"):
settings.transcriber = "vosk"
settings.vosk_model_path = arg
elif opt in ("--crepe"):
settings.crepe_model_capacity = arg
elif opt in ("--crepe_step_size"):
settings.crepe_step_size = int(arg)
elif opt in ("--crepe_batch_size"):
settings.crepe_batch_size = int(arg)
elif opt in ("--plot"):
settings.create_plot = arg
settings.create_plot = arg in ["True", "true"]
elif opt in ("--midi"):
settings.create_midi = arg in ["True", "true"]
elif opt in ("--hyphenation"):
settings.hyphenation = arg
elif opt in ("--disable_separation"):
@@ -798,6 +820,8 @@
settings.force_whisper_cpu = arg
elif opt in ("--force_separation_cpu"):
settings.force_separation_cpu = arg
elif opt in ("--force_crepe_cpu"):
settings.force_crepe_cpu = arg
if settings.output_file_path == "":
if settings.input_file_path.startswith("https:"):
dirname = os.getcwd()
@@ -813,16 +837,23 @@ def arg_options():
"ifile=",
"ofile=",
"crepe=",
"crepe_step_size=",
"crepe_batch_size=",
"vosk=",
"whisper=",
"align_model=",
"whisper_align_model=",
"whisper_batch_size=",
"whisper_compute_type=",
"language=",
"plot=",
"midi=",
"hyphenation=",
"disable_separation=",
"disable_karaoke=",
"create_audio_chunks=",
"force_whisper_cpu=",
"force_separation_cpu="
"force_separation_cpu=",
"force_crepe_cpu=",
]
return long, short
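
One observation on `init_settings`: `--plot` and `--midi` are converted to real booleans via `arg in ["True", "true"]`, while `--hyphenation`, `--disable_separation` and the `--force_*_cpu` flags store the raw argument string, which is truthy for any non-empty value (including "False"). A small helper would make the parsing uniform (a hypothetical suggestion, not part of this PR):

```python
def parse_bool_arg(arg: str) -> bool:
    """Interpret CLI strings such as "True"/"false" as real booleans."""
    return arg.strip().lower() in ("true", "1", "yes")

# e.g. settings.force_crepe_cpu = parse_bool_arg(arg)
```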
