diff --git a/LICENSE b/LICENSE index dfe8101..a346333 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ MIT License Copyright (c) 2023 Vadim Rangnau +Copyright (c) 2020 Max Morrison (torchcrepe code adapted for crepe output filtering and thresholding) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md index 4f4567e..823e6ad 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,7 @@ _Not all options working now!_ --hyphenation True|False >> ((default) is True) --disable_separation True|False >> ((default) is False) --disable_karaoke True|False >> ((default) is False) + --ignore_audio True|False >> ((default) is False) --create_audio_chunks True|False >> ((default) is False) --keep_cache True|False >> ((default) is False) --plot True|False >> ((default) is False)
diff --git a/pytest/modules/Pitcher/test_pitcher.py b/pytest/modules/Pitcher/test_pitcher.py index e623986..a028843 100644 --- a/pytest/modules/Pitcher/test_pitcher.py +++ b/pytest/modules/Pitcher/test_pitcher.py @@ -3,12 +3,20 @@ import os import unittest import src.modules.Pitcher.pitcher as test_subject + +import numpy as np +import pandas as pd +from matplotlib import pyplot as plt +from sklearn.cluster import KMeans +from sklearn import preprocessing as p +from sklearn.decomposition import PCA + import pytest from src.modules.plot import plot class PitcherTest(unittest.TestCase): - @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests") + # @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests") def test_get_pitch_with_crepe_file(self): # Arrange test_dir = os.path.dirname(os.path.abspath(__file__)) @@ -21,7 +29,63 @@ def test_get_pitch_with_crepe_file(self): pitched_data = test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', device="cuda") # test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', 'cpu', batch_size=1024) plot(pitched_data, test_output, title="pitching test") + + print("done") + + + # @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests") + def test_pitch_clustering(self): + # Arrange + times = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1, 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09, 1.1, 1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.19, 1.2, 1.21, 1.22, 1.23, 1.24, 1.25, 1.26, 1.27, 1.28, 1.29, 1.3, 1.31, 1.32, 1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4, 1.41, 1.42, 1.43, 1.44, 1.45, 1.46, 1.47, 1.48, 1.49, 1.5, 1.51, 1.52, 1.53, 1.54, 1.55, 1.56, 1.57, 1.58, 1.59, 1.6, 1.61, 1.62, 1.63, 1.64, 1.65, 1.66, 1.67, 1.68, 1.69, 1.7, 1.71, 1.72, 1.73, 1.74, 1.75, 1.76, 1.77, 1.78, 1.79, 1.8, 1.81, 1.82, 1.83, 1.84, 1.85, 1.86, 1.87, 1.88, 1.89, 1.9, 1.91, 1.92, 1.93, 1.94, 1.95, 1.96, 1.97, 1.98, 1.99, 2, 2.01, 2.02, 2.03, 2.04, 2.05, 2.06, 2.07, 2.08, 2.09, 2.1, 2.11, 2.12, 2.13, 2.14, 2.15,
2.16, 2.17, 2.18, 2.19, 2.2, 2.21, 2.22, 2.23, 2.24, 2.25, 2.26, 2.27, 2.28, 2.29, 2.3, 2.31, 2.32, 2.33, 2.34, 2.35, 2.36, 2.37, 2.38, 2.39, 2.4, 2.41, 2.42, 2.43, 2.44, 2.45, 2.46, 2.47, 2.48, 2.49, 2.5, 2.51, 2.52, 2.53, 2.54, 2.55, 2.56, 2.57, 2.58, 2.59, 2.6, 2.61, 2.62, 2.63, 2.64, 2.65, 2.66, 2.67, 2.68, 2.69, 2.7, 2.71, 2.72, 2.73, 2.74, 2.75, 2.76, 2.77, 2.78, 2.79, 2.8, 2.81, 2.82, 2.83, 2.84, 2.85, 2.86, 2.87, 2.88, 2.89, 2.9, 2.91, 2.92, 2.93, 2.94, 2.95, 2.96, 2.97, 2.98, 2.99, 3, 3.01, 3.02, 3.03, 3.04, 3.05, 3.06, 3.07, 3.08, 3.09, 3.1, 3.11, 3.12, 3.13, 3.14, 3.15, 3.16, 3.17, 3.18, 3.19, 3.2, 3.21, 3.22, 3.23, 3.24, 3.25, 3.26, 3.27, 3.28, 3.29, 3.3, 3.31, 3.32, 3.33, 3.34, 3.35, 3.36, 3.37, 3.38, 3.39, 3.4, 3.41, 3.42, 3.43, 3.44, 3.45, 3.46, 3.47, 3.48, 3.49, 3.5, 3.51, 3.52, 3.53, 3.54, 3.55, 3.56, 3.57, 3.58, 3.59, 3.6, 3.61, 3.62, 3.63, 3.64, 3.65, 3.66, 3.67, 3.68, 3.69, 3.7, 3.71, 3.72, 3.73, 3.74, 3.75, 3.76, 3.77, 3.78, 3.79, 3.8, 3.81, 3.82, 3.83, 3.84, 3.85, 3.86, 3.87, 3.88, 3.89, 3.9, 3.91, 3.92, 3.93, 3.94, 3.95, 3.96, 3.97, 3.98, 3.99, 4, 4.01, 4.02, 4.03, 4.04, 4.05, 4.06, 4.07, 4.08, 4.09, 4.1, 4.11, 4.12, 4.13, 4.14, 4.15, 4.16, 4.17, 4.18, 4.19, 4.2, 4.21, 4.22, 4.23, 4.24, 4.25, 4.26, 4.27, 4.28, 4.29, 4.3, 4.31, 4.32, 4.33, 4.34, 4.35, 4.36, 4.37, 4.38, 4.39, 4.4, 4.41, 4.42, 4.43, 4.44, 4.45, 4.46, 4.47, 4.48, 4.49, 4.5, 4.51, 4.52, 4.53, 4.54, 4.55, 4.56, 4.57, 4.58, 4.59, 4.6, 4.61, 4.62, 4.63, 4.64, 4.65, 4.66, 4.67, 4.68, 4.69, 4.7, 4.71, 4.72, 4.73, 4.74, 4.75, 4.76, 4.77, 4.78, 4.79, 4.8, 4.81, 4.82, 4.83, 4.84, 4.85, 4.86, 4.87, 4.88, 4.89, 4.9, 4.91, 4.92, 4.93, 4.94, 4.95, 4.96, 4.97, 4.98, 4.99, 5, 5.01, 5.02, 5.03, 5.04, 5.05, 5.06, 5.07, 5.08, 5.09, 5.1, 5.11, 5.12, 5.13, 5.14, 5.15, 5.16, 5.17, 5.18, 5.19, 5.2, 5.21, 5.22, 5.23, 5.24, 5.25, 5.26, 5.27, 5.28, 5.29, 5.3, 5.31, 5.32, 5.33, 5.34, 5.35, 5.36, 5.37, 5.38, 5.39, 5.4, 5.41, 5.42, 5.43, 5.44, 5.45, 5.46, 5.47, 5.48, 5.49, 5.5, 5.51, 5.52, 5.53, 5.54, 5.55, 5.56, 5.57, 5.58, 5.59, 5.6, 5.61, 5.62, 5.63, 5.64, 5.65, 5.66, 5.67, 5.68, 5.69, 5.7, 5.71, 5.72, 5.73, 5.74, 5.75, 5.76, 5.77, 5.78, 5.79, 5.8, 5.81, 5.82, 5.83, 5.84, 5.85, 5.86, 5.87, 5.88, 5.89, 5.9, 5.91, 5.92, 5.93, 5.94, 5.95, 5.96, 5.97, 5.98, 5.99, 6, 6.01, 6.02, 6.03, 6.04, 6.05, 6.06, 6.07, 6.08, 6.09, 6.1, 6.11, 6.12, 6.13, 6.14, 6.15, 6.16, 6.17, 6.18, 6.19, 6.2, 6.21, 6.22, 6.23, 6.24, 6.25, 6.26, 6.27, 6.28, 6.29, 6.3, 6.31, 6.32, 6.33, 6.34, 6.35, 6.36, 6.37, 6.38, 6.39, 6.4, 6.41, 6.42, 6.43, 6.44, 6.45, 6.46, 6.47, 6.48, 6.49, 6.5, 6.51, 6.52, 6.53, 6.54, 6.55, 6.56, 6.57, 6.58, 6.59, 6.6, 6.61, 6.62, 6.63, 6.64, 6.65, 6.66, 6.67, 6.68, 6.69, 6.7, 6.71, 6.72, 6.73, 6.74, 6.75, 6.76, 6.77, 6.78, 6.79, 6.8, 6.81, 6.82, 6.83, 6.84, 6.85, 6.86, 6.87, 6.88, 6.89, 6.9, 6.91, 6.92, 6.93, 6.94, 6.95, 6.96, 6.97, 6.98, 6.99, 7, 7.01, 7.02, 7.03, 7.04, 7.05, 7.06, 7.07, 7.08, 7.09, 7.1, 7.11, 7.12, 7.13, 7.14, 7.15, 7.16, 7.17, 7.18, 7.19, 7.2, 7.21, 7.22, 7.23, 7.24, 7.25, 7.26, 7.27, 7.28, 7.29, 7.3, 7.31, 7.32, 7.33, 7.34, 7.35, 7.36, 7.37, 7.38, 7.39, 7.4, 7.41, 7.42, 7.43, 7.44, 7.45, 7.46, 7.47, 7.48, 7.49, 7.5, 7.51, 7.52, 7.53, 7.54, 7.55, 7.56, 7.57, 7.58, 7.59, 7.6, 7.61, 7.62, 7.63, 7.64, 7.65, 7.66, 7.67, 7.68, 7.69, 7.7, 7.71, 7.72, 7.73, 7.74, 7.75, 7.76, 7.77, 7.78, 7.79, 7.8, 7.81, 7.82, 7.83, 7.84, 7.85, 7.86, 7.87, 7.88, 7.89, 7.9, 7.91, 7.92, 7.93, 7.94, 7.95, 7.96, 7.97, 7.98, 7.99, 8, 8.01, 8.02, 8.03, 8.04, 8.05, 8.06, 8.07, 8.08, 8.09, 8.1, 8.11, 8.12, 8.13, 8.14, 8.15, 8.16, 8.17, 8.18, 8.19, 
8.2, 8.21, 8.22, 8.23, 8.24, 8.25, 8.26, 8.27, 8.28, 8.29, 8.3, 8.31, 8.32, 8.33, 8.34, 8.35, 8.36, 8.37, 8.38, 8.39, 8.4, 8.41, 8.42, 8.43] + frequencies = [665.03, 659.52, 646.07, 572.62, 590.38, 649.3, 600.02, 624.6, 646.16, 650.34, 646.06, 651.35, 650.49, 589.08, 603.26, 625.12, 627.36, 636.09, 660.45, 659.91, 648.32, 657.78, 597.3, 595.76, 594.63, 659.18, 625.98, 645.65, 645.76, 650.01, 652.28, 653.14, 664.93, 662.59, 660.76, 642.67, 644.79, 649.94, 625.44, 627.31, 645.4, 645.22, 652.12, 598.35, 623.99, 644.58, 645.09, 650.61, 650.01, 598.85, 656.88, 636.74, 652.16, 650.66, 649.87, 657.12, 625.48, 634.35, 661.24, 651.33, 655.77, 658.05, 661.75, 660.07, 661.37, 662.0, 662.29, 662.33, 664.63, 661.54, 661.31, 630.51, 590.5, 658.83, 625.16, 635.46, 661.08, 659.04, 659.97, 666.0, 670.29, 666.15, 658.62, 662.12, 662.78, 656.71, 662.49, 661.36, 641.94, 647.05, 652.86, 598.11, 599.27, 656.18, 625.69, 659.14, 657.94, 659.05, 657.8, 656.69, 653.63, 638.82, 631.24, 625.38, 635.76, 658.93, 657.88, 660.1, 660.92, 613.83, 593.21, 608.63, 624.15, 633.75, 659.97, 658.17, 657.64, 590.49, 651.47, 656.66, 657.47, 637.08, 658.92, 659.45, 590.42, 592.88, 592.52, 596.55, 657.24, 660.3, 634.03, 635.35, 647.72, 648.55, 648.52, 647.42, 645.35, 648.98, 653.16, 650.04, 656.79, 644.61, 646.96, 648.74, 646.2, 642.01, 632.67, 698.0, 660.9, 635.71, 637.32, 646.88, 645.85, 644.92, 647.01, 646.71, 645.62, 645.52, 653.7, 660.06, 626.52, 646.6, 652.7, 653.21, 597.68, 658.95, 660.67, 636.1, 657.82, 659.44, 653.16, 652.2, 657.95, 660.01, 627.14, 636.14, 644.73, 649.03, 651.96, 662.32, 675.07, 679.81, 683.4, 692.69, 699.38, 695.89, 697.21, 698.53, 700.52, 704.75, 707.07, 691.78, 682.75, 677.22, 669.76, 660.38, 656.93, 645.32, 643.54, 647.62, 592.21, 658.68, 658.62, 660.63, 634.49, 642.43, 654.18, 659.81, 671.21, 678.8, 684.99, 690.6, 648.73, 598.17, 658.13, 659.56, 690.23, 642.85, 647.82, 648.53, 650.73, 647.39, 645.03, 647.35, 650.27, 649.18, 647.71, 646.93, 646.93, 640.58, 636.02, 632.72, 626.74, 619.02, 612.84, 597.52, 632.17, 663.0, 690.65, 643.34, 653.35, 647.88, 653.15, 652.6, 658.93, 658.99, 659.11, 658.06, 659.11, 660.6, 641.79, 610.65, 598.81, 659.95, 635.14, 660.15, 662.85, 651.13, 635.97, 658.73, 658.91, 659.11, 690.8, 695.01, 683.24, 681.22, 674.65, 663.59, 659.06, 649.47, 640.65, 632.87, 624.67, 645.13, 661.67, 646.04, 647.7, 651.95, 652.31, 659.4, 692.03, 635.43, 628.9, 624.13, 604.23, 597.85, 624.24, 624.9, 645.21, 645.66, 646.12, 649.43, 598.26, 657.7, 635.79, 652.29, 657.12, 659.24, 654.06, 646.01, 645.04, 584.87, 532.52, 537.38, 470.4, 419.16, 376.77, 331.49, 300.82, 266.99, 235.5, 208.51, 187.03, 166.8, 145.7, 128.59, 114.6, 111.88, 111.34, 111.4, 112.1, 112.58, 112.77, 112.31, 111.91, 111.61, 111.35, 111.24, 111.36, 111.71, 112.28, 112.58, 112.6, 112.72, 112.64, 112.59, 112.58, 112.61, 112.7, 112.64, 112.4, 112.35, 112.19, 112.12, 112.28, 112.71, 113.53, 114.07, 114.72, 115.76, 116.9, 118.35, 120.27, 122.07, 123.86, 125.5, 127.17, 128.75, 130.15, 130.75, 131.19, 131.61, 132.12, 132.39, 132.65, 133.09, 133.35, 133.27, 133.47, 133.67, 133.72, 134.03, 134.05, 133.88, 133.98, 133.74, 133.51, 133.39, 133.18, 132.97, 132.75, 132.42, 132.22, 132.33, 132.47, 132.95, 133.73, 134.7, 136.04, 137.57, 138.72, 140.38, 142.22, 144.34, 146.13, 147.59, 149.4, 151.91, 154.03, 155.77, 157.2, 158.19, 158.3, 157.86, 157.38, 156.9, 156.33, 155.93, 155.5, 155.25, 155.1, 154.86, 154.75, 154.79, 154.81, 154.89, 154.93, 154.99, 155.17, 155.21, 155.27, 155.36, 155.26, 155.27, 155.32, 155.59, 155.95, 157.1, 159.24, 161.65, 
163.48, 165.11, 166.87, 169.52, 172.43, 174.91, 177.59, 180.13, 182.91, 185.0, 186.43, 187.73, 188.14, 188.07, 187.49, 186.78, 186.54, 186.58, 186.74, 186.89, 186.78, 186.7, 186.94, 187.54, 188.14, 188.32, 188.13, 187.78, 187.68, 187.78, 187.92, 188.01, 187.96, 188.35, 189.13, 190.21, 190.87, 191.13, 190.8, 190.31, 189.41, 188.83, 188.26, 187.96, 187.35, 186.75, 186.32, 185.88, 185.66, 185.5, 185.51, 185.99, 186.39, 187.09, 187.52, 187.75, 187.83, 188.01, 188.66, 189.73, 190.58, 190.67, 190.03, 189.38, 188.8, 188.28, 188.16, 188.06, 187.99, 187.94, 187.91, 188.05, 188.52, 189.14, 189.76, 190.26, 190.42, 190.45, 190.43, 190.29, 190.09, 189.81, 189.93, 189.84, 189.58, 189.03, 188.63, 188.51, 188.85, 189.62, 190.32, 190.56, 190.57, 190.23, 189.8, 189.29, 189.1, 188.58, 187.76, 185.73, 183.73, 179.79, 174.77, 167.02, 164.36, 163.86, 164.43, 165.23, 166.1, 166.97, 167.8, 168.58, 169.3, 169.57, 169.86, 169.71, 169.03, 168.81, 168.58, 168.81, 169.13, 169.9, 170.26, 170.77, 171.23, 171.33, 171.52, 171.68, 171.48, 170.94, 170.4, 169.77, 169.56, 169.39, 169.32, 169.49, 169.51, 169.93, 170.61, 171.28, 171.92, 172.58, 172.45, 172.17, 172.0, 171.76, 171.48, 171.1, 170.78, 170.49, 170.16, 170.08, 170.66, 171.31, 172.1, 172.42, 172.75, 173.04, 173.32, 173.54, 173.76, 173.83, 173.66, 173.12, 172.18, 170.25, 166.95, 164.22, 159.63, 153.98, 149.11, 147.08, 147.28, 147.78, 148.69, 149.42, 149.85, 150.08, 150.11, 150.21, 150.23, 150.26, 150.09, 149.83, 149.6, 149.44, 149.46, 149.45, 149.46, 149.54, 149.83, 150.41, 151.16, 152.16, 152.9, 153.67, 154.13, 154.6, 154.88, 155.06, 155.09, 154.94, 154.86, 154.6, 154.1, 153.51, 152.86, 152.64, 152.4, 152.49, 152.71, 152.89, 153.26, 153.54, 154.2, 154.68, 155.2, 155.77, 156.31, 156.93, 157.24, 157.01, 157.61, 156.31, 170.0, 193.15, 209.91, 234.64, 255.53, 284.74, 306.29, 333.92, 377.15, 401.52, 448.67, 492.14, 535.32, 591.44, 589.15, 595.15, 607.96, 625.84, 636.75, 646.5, 644.68, 652.56, 658.79, 649.39, 631.42, 633.55, 645.17, 644.46, 649.97, 581.36, 581.38, 590.79, 601.73, 592.23, 591.62, 594.09, 633.63, 633.25, 627.58, 658.55, 663.0, 662.13, 648.45, 640.45, 634.05, 645.37, 659.85, 662.4, 667.63, 668.55, 662.2, 662.92, 661.72, 657.75, 653.06, 640.97, 628.55, 626.07, 629.33, 628.84, 635.41, 635.19, 645.97, 650.58, 655.78, 657.31, 657.48, 646.94, 645.51, 651.18, 655.65, 627.13, 647.02, 652.56, 651.73, 643.98, 649.71, 659.13, 689.11, 672.49, 653.14, 646.99, 647.11, 599.3, 624.89, 626.45, 634.17, 646.55, 653.63, 658.17, 662.79, 683.29, 691.02, 635.07, 615.7, 598.63, 616.46, 633.13, 658.27, 643.39, 647.15, 650.84, 656.71, 625.19, 646.25, 657.19, 647.88, 634.7, 636.86, 646.4, 649.72, 596.76, 597.36, 656.17, 626.22, 644.55, 644.42, 651.81, 625.18, 626.33, 634.01, 644.88, 651.92, 595.9, 652.5, 623.37, 656.65, 646.16, 645.91, 651.86, 596.94, 656.35, 658.52, 635.42, 652.11, 660.49, 590.83, 603.91, 613.4, 632.95, 646.0, 658.46, 646.69, 647.74, 590.35, 591.16, 650.7, 596.67, 657.69, 660.8, 689.3, 636.49, 658.31, 647.31, 645.17, 645.15, 649.0, 657.39, 636.03, 647.63, 657.81, 591.34, 596.66, 656.91, 658.68, 641.36, 648.52, 659.13, 590.37, 591.02, 650.28, 656.05, 624.6, 657.67, 652.16, 650.86, 650.79, 657.52, 634.04, 641.58, 645.91, 658.51, 625.29, 634.09, 645.1, 642.77, 634.01, 626.52, 645.07, 650.76101509] + frequencies_log_10 = [freq * 10 for freq in np.log10(frequencies)] + confidence = [0.04, 0.044, 0.109, 0.033, 0.094, 0.078, 0.085, 0.093, 0.099, 0.125, 0.156, 0.168, 0.094, 0.153, 0.063, 0.06, 0.095, 0.119, 0.121, 0.04, 0.098, 0.102, 0.076, 0.089, 0.076, 0.058, 0.075, 0.089, 
0.139, 0.157, 0.144, 0.095, 0.032, 0.041, 0.094, 0.124, 0.112, 0.103, 0.104, 0.113, 0.096, 0.177, 0.149, 0.086, 0.079, 0.088, 0.134, 0.111, 0.071, 0.082, 0.097, 0.109, 0.149, 0.142, 0.154, 0.132, 0.117, 0.071, 0.071, 0.10, 0.098, 0.106, 0.087, 0.103, 0.067, 0.069, 0.078, 0.094, 0.303, 0.365, 0.056, 0.014, 0.037, 0.068, 0.106, 0.097, 0.09, 0.092, 0.034, 0.078, 0.028, 0.037, 0.016, 0.009, 0.042, 0.042, 0.041, 0.06, 0.115, 0.151, 0.132, 0.103, 0.092, 0.094, 0.08, 0.106, 0.138, 0.083, 0.077, 0.233, 0.273, 0.074, 0.073, 0.106, 0.103, 0.117, 0.081, 0.084, 0.051, 0.08, 0.036, 0.027, 0.047, 0.108, 0.085, 0.117, 0.099, 0.085, 0.084, 0.092, 0.105, 0.104, 0.086, 0.14, 0.083, 0.04, 0.083, 0.057, 0.08, 0.083, 0.058, 0.106, 0.089, 0.095, 0.046, 0.034, 0.039, 0.138, 0.23, 0.628, 0.397, 0.106, 0.036, 0.034, 0.039, 0.048, 0.056, 0.067, 0.066, 0.047, 0.022, 0.078, 0.04, 0.026, 0.028, 0.069, 0.061, 0.105, 0.056, 0.056, 0.085, 0.097, 0.093, 0.093, 0.075, 0.061, 0.066, 0.10, 0.102, 0.147, 0.093, 0.083, 0.088, 0.083, 0.069, 0.051, 0.04, 0.05, 0.133, 0.075, 0.051, 0.051, 0.108, 0.229, 0.038, 0.03, 0.052, 0.043, 0.068, 0.056, 0.081, 0.131, 0.104, 0.072, 0.056, 0.098, 0.025, 0.047, 0.074, 0.063, 0.068, 0.067, 0.072, 0.084, 0.085, 0.105, 0.051, 0.047, 0.04, 0.116, 0.038, 0.073, 0.037, 0.072, 0.087, 0.083, 0.08, 0.081, 0.075, 0.057, 0.076, 0.046, 0.038, 0.015, 0.03, 0.259, 0.571, 0.455, 0.101, 0.131, 0.033, 0.08, 0.069, 0.094, 0.111, 0.046, 0.037, 0.042, 0.046, 0.083, 0.108, 0.129, 0.101, 0.083, 0.062, 0.091, 0.095, 0.105, 0.123, 0.046, 0.046, 0.045, 0.048, 0.075, 0.07, 0.064, 0.032, 0.045, 0.069, 0.058, 0.091, 0.096, 0.051, 0.028, 0.04, 0.021, 0.048, 0.067, 0.102, 0.071, 0.019, 0.041, 0.048, 0.084, 0.075, 0.108, 0.122, 0.108, 0.087, 0.088, 0.051, 0.079, 0.042, 0.016, 0.028, 0.09, 0.105, 0.084, 0.073, 0.122, 0.125, 0.073, 0.07, 0.099, 0.094, 0.137, 0.112, 0.068, 0.023, 0.069, 0.087, 0.04, 0.079, 0.026, 0.092, 0.027, 0.05, 0.12, 0.112, 0.094, 0.063, 0.041, 0.129, 0.758, 0.529, 0.106, 0.117, 0.432, 0.798, 0.908, 0.898, 0.879, 0.889, 0.89, 0.919, 0.926, 0.923, 0.918, 0.921, 0.916, 0.893, 0.891, 0.893, 0.899, 0.901, 0.904, 0.897, 0.891, 0.895, 0.892, 0.887, 0.893, 0.909, 0.916, 0.902, 0.889, 0.889, 0.918, 0.911, 0.937, 0.936, 0.912, 0.895, 0.915, 0.926, 0.891, 0.884, 0.893, 0.90, 0.934, 0.943, 0.942, 0.925, 0.924, 0.936, 0.945, 0.931, 0.937, 0.931, 0.938, 0.94, 0.952, 0.949, 0.942, 0.95, 0.941, 0.929, 0.936, 0.937, 0.945, 0.95, 0.932, 0.927, 0.938, 0.935, 0.945, 0.945, 0.94, 0.902, 0.915, 0.912, 0.88, 0.912, 0.915, 0.953, 0.959, 0.933, 0.922, 0.939, 0.955, 0.937, 0.959, 0.956, 0.961, 0.953, 0.938, 0.961, 0.967, 0.959, 0.95, 0.95, 0.951, 0.965, 0.958, 0.958, 0.96, 0.955, 0.955, 0.948, 0.95, 0.951, 0.957, 0.948, 0.956, 0.952, 0.962, 0.966, 0.927, 0.928, 0.936, 0.953, 0.966, 0.942, 0.897, 0.911, 0.923, 0.931, 0.921, 0.935, 0.953, 0.923, 0.932, 0.924, 0.927, 0.94, 0.924, 0.935, 0.929, 0.921, 0.93, 0.922, 0.918, 0.931, 0.94, 0.928, 0.92, 0.919, 0.938, 0.938, 0.935, 0.932, 0.933, 0.932, 0.92, 0.915, 0.928, 0.911, 0.901, 0.916, 0.931, 0.924, 0.903, 0.919, 0.933, 0.939, 0.918, 0.94, 0.953, 0.958, 0.963, 0.964, 0.955, 0.928, 0.93, 0.936, 0.933, 0.935, 0.932, 0.895, 0.929, 0.916, 0.91, 0.93, 0.917, 0.894, 0.918, 0.926, 0.928, 0.933, 0.935, 0.927, 0.929, 0.906, 0.916, 0.924, 0.926, 0.925, 0.917, 0.92, 0.924, 0.928, 0.931, 0.928, 0.93, 0.921, 0.905, 0.90, 0.905, 0.896, 0.921, 0.917, 0.912, 0.909, 0.924, 0.92, 0.916, 0.912, 0.898, 0.928, 0.944, 0.934, 0.815, 0.639, 0.781, 0.924, 0.956, 0.952, 0.976, 0.961, 0.955, 0.949, 0.909, 
0.927, 0.935, 0.92, 0.906, 0.915, 0.916, 0.912, 0.927, 0.932, 0.915, 0.90, 0.918, 0.932, 0.941, 0.932, 0.926, 0.934, 0.939, 0.899, 0.906, 0.924, 0.929, 0.927, 0.928, 0.918, 0.901, 0.90, 0.937, 0.931, 0.95, 0.951, 0.934, 0.917, 0.924, 0.942, 0.937, 0.923, 0.918, 0.912, 0.909, 0.912, 0.926, 0.925, 0.943, 0.954, 0.958, 0.958, 0.956, 0.953, 0.953, 0.956, 0.96, 0.925, 0.861, 0.893, 0.864, 0.779, 0.709, 0.846, 0.944, 0.963, 0.957, 0.931, 0.939, 0.93, 0.926, 0.929, 0.929, 0.932, 0.931, 0.927, 0.925, 0.937, 0.936, 0.941, 0.935, 0.94, 0.932, 0.925, 0.931, 0.931, 0.938, 0.944, 0.939, 0.955, 0.958, 0.951, 0.951, 0.946, 0.953, 0.957, 0.96, 0.951, 0.931, 0.941, 0.944, 0.941, 0.942, 0.946, 0.94, 0.936, 0.93, 0.954, 0.954, 0.943, 0.954, 0.938, 0.876, 0.728, 0.592, 0.365, 0.058, 0.025, 0.043, 0.031, 0.041, 0.029, 0.025, 0.012, 0.009, 0.018, 0.029, 0.033, 0.026, 0.018, 0.04, 0.069, 0.045, 0.078, 0.01, 0.064, 0.327, 0.034, 0.012, 0.091, 0.08, 0.082, 0.09, 0.117, 0.148, 0.123, 0.315, 0.07, 0.005, 0.008, 0.039, 0.13, 0.084, 0.081, 0.14, 0.102, 0.053, 0.043, 0.163, 0.067, 0.037, 0.04, 0.058, 0.029, 0.034, 0.03, 0.025, 0.261, 0.122, 0.047, 0.081, 0.047, 0.184, 0.10, 0.177, 0.092, 0.052, 0.032, 0.039, 0.127, 0.084, 0.072, 0.09, 0.125, 0.127, 0.128, 0.118, 0.097, 0.124, 0.114, 0.148, 0.124, 0.081, 0.095, 0.097, 0.11, 0.074, 0.14, 0.117, 0.089, 0.09, 0.089, 0.101, 0.122, 0.128, 0.123, 0.094, 0.062, 0.038, 0.035, 0.064, 0.032, 0.033, 0.045, 0.074, 0.109, 0.126, 0.112, 0.127, 0.093, 0.093, 0.059, 0.091, 0.026, 0.053, 0.10, 0.132, 0.115, 0.065, 0.071, 0.063, 0.081, 0.126, 0.143, 0.125, 0.093, 0.112, 0.102, 0.17, 0.128, 0.077, 0.087, 0.069, 0.061, 0.102, 0.139, 0.143, 0.07, 0.076, 0.124, 0.114, 0.14, 0.099, 0.033, 0.062, 0.056, 0.091, 0.041, 0.112, 0.116, 0.145, 0.077, 0.08, 0.073, 0.074, 0.09, 0.091, 0.105, 0.045, 0.034, 0.164, 0.131, 0.102, 0.082, 0.115, 0.085, 0.149, 0.081, 0.04, 0.064, 0.103, 0.135, 0.09, 0.13, 0.109, 0.071, 0.082, 0.08, 0.088, 0.099, 0.105, 0.133, 0.136, 0.133, 0.117, 0.098, 0.023, 0.105, 0.089, 0.102, 0.029, 0.097, 0.034, 0.076, 0.095, 0.103, 0.15038174] + + matrix = [[times[i], frequencies_log_10[i], confidence[i]] for i, _ in enumerate(times)] + # Act + df = pd.DataFrame(matrix) + df.columns = ['time', 'log 10 frequency', 'confidence'] + df_ss = pd.DataFrame(p.minmax_scale(df)) + df_ss.columns = ['time', 'log 10 frequency', 'confidence'] + + # apply custom weight to frequency + df_ss['log 10 frequency'] = df_ss['log 10 frequency'] / 2 + + clusters = 20 + labels = fit_kmeans(df_ss, clusters) + figure, axis = plt.subplots(2, 2) + axis[0][0].scatter(df['time'], df['log 10 frequency'], c=labels, cmap='Set1', s=5) + axis[0][0].set_title("Ratio 1:1:1") + + # apply custom weight to frequency + df_ss['log 10 frequency'] = df['log 10 frequency'] / 5 + labels = fit_kmeans(df_ss, clusters) + axis[1][0].scatter(df['time'], df['log 10 frequency'], c=labels, cmap='Set1', s=5) + axis[1][0].set_title("Ratio 1:5:1") + + # apply custom weight to frequency + df_ss['confidence'] = df['confidence'] / 100 + labels = fit_kmeans(df_ss, clusters) + axis[0][1].scatter(df['time'], df['log 10 frequency'], c=labels, cmap='Set1', s=5) + axis[0][1].set_title("Ratio 1:1:100") + + # apply custom weight to frequency + df_ss['time'] = df['time'] / 100 + labels = fit_kmeans(df_ss, clusters) + axis[1][1].scatter(df['time'], df['log 10 frequency'], c=labels, cmap='Set1', s=5) + axis[1][1].set_title("Ratio 100:1:1") + + figure.set_figwidth(12.8) + plt.show() print("done") + +def fit_kmeans(data, centers): + kmeans = KMeans(centers) 
+ labels = kmeans.fit_predict(data) + return labels + + + + if __name__ == "__main__": unittest.main()
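The clustering experiment above leans on a property of k-means worth spelling out: distances are plain Euclidean, so once every column has been min-max scaled into [0, 1], dividing a single column by a constant acts as an implicit feature weight (hence the "Ratio" panel titles). A minimal sketch of that trick on synthetic stand-in data:

```python
# Minimal sketch (synthetic data) of the feature weighting used in
# test_pitch_clustering: after min-max scaling, dividing a column shrinks
# its contribution to the Euclidean distance that KMeans minimizes.
import numpy as np
from sklearn import preprocessing as p
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
data = rng.random((500, 3))    # stand-ins for time, log10 frequency, confidence

scaled = p.minmax_scale(data)  # every column now spans [0, 1]
scaled[:, 1] /= 5              # "Ratio 1:5:1": frequency contributes 5x less

labels = KMeans(n_clusters=20, n_init=10).fit_predict(scaled)
print(np.bincount(labels))     # cluster sizes
```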
", "end": 7.5, "start": 5.5 @@ -110,20 +110,27 @@ def default_values(default_ultrastar_class, ver): expected_calls = [] if version.parse(ver) >= version.parse("1.0.0"): expected_calls.append(f"#{UltrastarTxtTag.VERSION}:{default_ultrastar_class.version}\n") - expected_calls.append(f"#{UltrastarTxtTag.ARTIST}:{default_ultrastar_class.artist}\n") - expected_calls.append(f"#{UltrastarTxtTag.TITLE}:{default_ultrastar_class.title}\n") - expected_calls.append(f"#{UltrastarTxtTag.MP3}:{default_ultrastar_class.mp3}\n") + expected_calls += [ + f"#{UltrastarTxtTag.ARTIST}:{default_ultrastar_class.artist}\n", + f"#{UltrastarTxtTag.TITLE}:{default_ultrastar_class.title}\n", + f"#{UltrastarTxtTag.MP3}:{default_ultrastar_class.mp3}\n" + ] if version.parse(ver) >= version.parse("1.1.0"): - expected_calls.append(f"#{UltrastarTxtTag.AUDIO}:{default_ultrastar_class.audio}\n") - expected_calls.append(f"#{UltrastarTxtTag.VIDEO}:{default_ultrastar_class.video}\n") # todo: video is optional - expected_calls.append(f"#{UltrastarTxtTag.BPM}:390.0\n") - expected_calls.append(f"#{UltrastarTxtTag.GAP}:500\n") - expected_calls.append(f"#{UltrastarTxtTag.CREATOR}:{default_ultrastar_class.creator}\n") - expected_calls.append(f"#{UltrastarTxtTag.COMMENT}:{default_ultrastar_class.comment}\n") - expected_calls.append(": 0 52 1 UltraSinger \n") - expected_calls.append(": 65 39 2 is \n") - expected_calls.append(": 130 52 3 cool! \n") - expected_calls.append("E") + expected_calls += [f"#{UltrastarTxtTag.AUDIO}:{default_ultrastar_class.audio}\n"] + if default_ultrastar_class.video is not None: + expected_calls += [ + f"#{UltrastarTxtTag.VIDEO}:{default_ultrastar_class.video}\n", + ] + expected_calls += [ + f"#{UltrastarTxtTag.BPM}:390.0\n", + f"#{UltrastarTxtTag.GAP}:500\n", + f"#{UltrastarTxtTag.CREATOR}:{default_ultrastar_class.creator}\n", + f"#{UltrastarTxtTag.COMMENT}:{default_ultrastar_class.comment}\n", + ": 0 52 1 UltraSinger \n", + ": 65 39 2 is \n", + ": 130 52 3 cool! 
\n", + "E" + ] return expected_calls diff --git a/requirements.txt b/requirements.txt index 8477866..1f39d8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,7 @@ pydub~=0.25.1 PyHyphen~=4.0.3 python_Levenshtein~=0.25.1 scipy~=1.13.1 -tensorflow<2.11 +tensorflow==2.10.1 tqdm~=4.66.4 #whisperx~=3.1.1 yt_dlp~=2024.5.27 diff --git a/src/Settings.py b/src/Settings.py index 6f88672..ee928f4 100644 --- a/src/Settings.py +++ b/src/Settings.py @@ -1,3 +1,10 @@ +from dataclasses import dataclass + +from dataclasses_json import dataclass_json + + +@dataclass_json +@dataclass class Settings: APP_VERSION = "0.0.11-dev4" @@ -8,6 +15,8 @@ class Settings: use_separated_vocal = True create_karaoke = True keep_cache = False + ignore_audio = False + input_file_is_ultrastar_txt = False input_file_path = "" output_file_path = "" @@ -30,6 +39,7 @@ class Settings: # Pitch crepe_model_capacity = "full" # tiny|small|medium|large|full crepe_step_size = 10 # in miliseconds + pitch_loudness_threshold = -60 # Device pytorch_device = 'cpu' # cpu|cuda @@ -37,3 +47,12 @@ class Settings: force_cpu = False force_whisper_cpu = False force_crepe_cpu = False + + # UltraSinger Evaluation Configuration + test_songs_input_folder = None + cache_override_path = None + skip_cache_vocal_separation = False + skip_cache_denoise_vocal_audio = False + skip_cache_transcription = False + skip_cache_pitch_detection = False + calculate_score = True \ No newline at end of file diff --git a/src/UltraSinger.py b/src/UltraSinger.py index 3d70dcc..50db73f 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -4,6 +4,7 @@ import getopt import os import sys +from typing import Tuple, Any import re import Levenshtein @@ -15,6 +16,7 @@ import soundfile as sf from modules import os_helper +from modules.os_helper import check_file_exists from modules.Audio.denoise import ffmpeg_reduce_noise from modules.Audio.separation import separate_audio from modules.Audio.vocal_chunks import ( @@ -22,6 +24,8 @@ export_chunks_from_ultrastar_data, ) from modules.Audio.silence_processing import remove_silence_from_transcription_data, get_silence_sections +from modules.Speech_Recognition.TranscriptionResult import TranscriptionResult +from modules.Ultrastar.ultrastar_score_calculator import Score from modules.csv_handler import export_transcribed_data_to_csv from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_wav_to_mp3 from modules.Audio.youtube import ( @@ -37,6 +41,7 @@ gold_highlighted, light_blue_highlighted, red_highlighted, + green_highlighted, ) from modules.Midi.midi_creator import ( convert_frequencies_to_notes, @@ -52,23 +57,32 @@ get_pitch_with_crepe_file, ) from modules.Pitcher.pitched_data import PitchedData -from modules.Speech_Recognition.hyphenation import hyphenation, language_check, create_hyphenator +from modules.Speech_Recognition.hyphenation import ( + hyphenation, + language_check, + create_hyphenator, +) from modules.Speech_Recognition.Whisper import transcribe_with_whisper -from modules.Ultrastar import ultrastar_score_calculator, ultrastar_writer, ultrastar_converter, ultrastar_parser -from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue +from modules.Ultrastar import ( + ultrastar_score_calculator, + ultrastar_writer, + ultrastar_converter, + ultrastar_parser, +) +from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue, FILE_ENCODING from Settings import Settings from modules.Speech_Recognition.TranscribedData import TranscribedData from modules.plot import plot, plot_spectrogram 
diff --git a/src/UltraSinger.py b/src/UltraSinger.py index 3d70dcc..50db73f 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -4,6 +4,7 @@ import getopt import os import sys +from typing import Tuple, Any import re import Levenshtein @@ -15,6 +16,7 @@ import soundfile as sf from modules import os_helper +from modules.os_helper import check_file_exists from modules.Audio.denoise import ffmpeg_reduce_noise from modules.Audio.separation import separate_audio from modules.Audio.vocal_chunks import ( @@ -22,6 +24,8 @@ export_chunks_from_ultrastar_data, ) from modules.Audio.silence_processing import remove_silence_from_transcription_data, get_silence_sections +from modules.Speech_Recognition.TranscriptionResult import TranscriptionResult +from modules.Ultrastar.ultrastar_score_calculator import Score from modules.csv_handler import export_transcribed_data_to_csv from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_wav_to_mp3 from modules.Audio.youtube import ( @@ -37,6 +41,7 @@ gold_highlighted, light_blue_highlighted, red_highlighted, + green_highlighted, ) from modules.Midi.midi_creator import ( convert_frequencies_to_notes, @@ -52,23 +57,32 @@ get_pitch_with_crepe_file, ) from modules.Pitcher.pitched_data import PitchedData -from modules.Speech_Recognition.hyphenation import hyphenation, language_check, create_hyphenator +from modules.Speech_Recognition.hyphenation import ( + hyphenation, + language_check, + create_hyphenator, +) from modules.Speech_Recognition.Whisper import transcribe_with_whisper -from modules.Ultrastar import ultrastar_score_calculator, ultrastar_writer, ultrastar_converter, ultrastar_parser -from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue +from modules.Ultrastar import ( + ultrastar_score_calculator, + ultrastar_writer, + ultrastar_converter, + ultrastar_parser, +) +from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue, FILE_ENCODING from Settings import Settings from modules.Speech_Recognition.TranscribedData import TranscribedData from modules.plot import plot, plot_spectrogram
from modules.musicbrainz_client import get_music_infos settings = Settings() +SYLLABLE_SEGMENT_SIZE = 0.1 +SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE = 0.1 def pitch_each_chunk_with_crepe(directory: str) -> list[str]: """Pitch each chunk with crepe and return midi notes""" - print( - f"{ULTRASINGER_HEAD} Pitching each chunk with {blue_highlighted('crepe')}" - ) + print(f"{ULTRASINGER_HEAD} Pitching each chunk with {blue_highlighted('crepe')}") midi_notes = [] for filename in sorted( @@ -97,7 +111,9 @@ def pitch_each_chunk_with_crepe(directory: str) -> list[str]: return midi_notes -def add_hyphen_to_data(transcribed_data: list[TranscribedData], hyphen_words: list[list[str]]): +def add_hyphen_to_data( + transcribed_data: list[TranscribedData], hyphen_words: list[list[str]] +): """Add hyphen to transcribed data return new data list""" new_data = [] @@ -133,9 +149,7 @@ def get_bpm_from_data(data, sampling_rate): onset_env = librosa.onset.onset_strength(y=data, sr=sampling_rate) wav_tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sampling_rate) - print( - f"{ULTRASINGER_HEAD} BPM is {blue_highlighted(str(round(wav_tempo[0], 2)))}" - ) + print(f"{ULTRASINGER_HEAD} BPM is {blue_highlighted(str(round(wav_tempo[0], 2)))}") return wav_tempo[0] @@ -226,12 +240,12 @@ def remove_unecessary_punctuations(transcribed_data: list[TranscribedData]) -> N """Remove unecessary punctuations from transcribed data""" punctuation = ".," for i, data in enumerate(transcribed_data): - data.word = data.word.translate( - {ord(i): None for i in punctuation} - ) + data.word = data.word.translate({ord(i): None for i in punctuation}) -def hyphenate_each_word(language: str, transcribed_data: list[TranscribedData]) -> list[list[str]] | None: +def hyphenate_each_word( + language: str, transcribed_data: list[TranscribedData] +) -> list[list[str]] | None: """Hyphenate each word in the transcribed data.""" lang_region = language_check(language) if lang_region is None: @@ -281,17 +295,90 @@ def print_version() -> None: f"{ULTRASINGER_HEAD} {gold_highlighted('*****************************')}" ) -def run() -> None: +def split_syllables_into_segments( + transcribed_data: list[TranscribedData], +) -> list[TranscribedData]: + """Split every syllable into sub-segments""" + segment_size_decimal_points = len(str(SYLLABLE_SEGMENT_SIZE).split(".")[1]) + new_data = [] + + for i, data in enumerate(transcribed_data): + duration = data.end - data.start + if duration <= SYLLABLE_SEGMENT_SIZE: + new_data.append(data) + continue + + has_space = str(data.word).endswith(" ") + first_segment = copy.deepcopy(data) + filler_words_start = data.start + SYLLABLE_SEGMENT_SIZE + remainder = data.end - (filler_words_start) + first_segment.end = filler_words_start + if has_space: + first_segment.word = first_segment.word[:-1] + + new_data.append(first_segment) + + full_segments, partial_segment = divmod(remainder, SYLLABLE_SEGMENT_SIZE) + + if full_segments >= 1: + for i in range(int(full_segments)): + segment = TranscribedData() + segment.word = "~" + segment.start = filler_words_start + round( + i * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points + ) + segment.end = segment.start + SYLLABLE_SEGMENT_SIZE + new_data.append(segment) + + if partial_segment >= 0.01: + segment = TranscribedData() + segment.word = "~" + segment.start = filler_words_start + round( + full_segments * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points + ) + segment.end = segment.start + partial_segment + new_data.append(segment) + + if has_space: + new_data[-1].word += " " + return new_data
+ + +def merge_syllable_segments( + transcribed_data: list[TranscribedData], midi_segments: list[MidiSegment], us_notes: list[int] +) -> tuple[list[TranscribedData], list[str], list[int]]: + """Merge sub-segments of a syllable where the pitch is the same""" + new_data = [] + new_midi_notes = [] + new_us_notes = [] + + previous_data = None + + for i, data in enumerate(transcribed_data): + if ( + str(data.word).startswith("~") + and previous_data is not None + and midi_segments[i].note == midi_segments[i - 1].note + and data.start - previous_data.end <= SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE + ): + new_data[-1].end = data.end + else: + new_data.append(data) + new_midi_notes.append(midi_segments[i].note) + new_us_notes.append(us_notes[i]) + previous_data = data + return new_data, new_midi_notes, new_us_notes
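To let the pitch detection follow pitch changes inside long syllables, every syllable is first split into 0.1 s sub-segments (`split_syllables_into_segments`) and, after pitching, consecutive `~` fillers with the same detected note are merged back (`merge_syllable_segments`). A worked example with assumed values:

```python
# Worked example (assumed values) of the splitting rule above with
# SYLLABLE_SEGMENT_SIZE = 0.1: a 0.34 s syllable keeps its word for the first
# 0.1 s, then turns into "~" fillers laid out by divmod().
SYLLABLE_SEGMENT_SIZE = 0.1
start, end = 1.00, 1.34

head_end = start + SYLLABLE_SEGMENT_SIZE               # word segment: 1.00-1.10
full_segments, partial = divmod(end - head_end, SYLLABLE_SEGMENT_SIZE)
print(int(full_segments), round(partial, 2))           # 2 full fillers, 0.04 s rest
# resulting segments: 1.00-1.10 "word", 1.10-1.20 "~", 1.20-1.30 "~", 1.30-1.34 "~"
# merge_syllable_segments later re-joins adjacent "~" segments whose MIDI note
# matches and whose gap is <= SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE (0.1 s).
```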
+ + +def run() -> tuple[str, Score, Score]: """The processing function of this program""" - is_audio = ".txt" not in settings.input_file_path + settings.input_file_is_ultrastar_txt = settings.input_file_path.endswith(".txt") + ultrastar_class = None real_bpm = None (title, artist, year, genre) = (None, None, None, None) - if not is_audio: # Parse Ultrastar txt - print( - f"{ULTRASINGER_HEAD} {gold_highlighted('re-pitch mode')}" - ) + if settings.input_file_is_ultrastar_txt: # Parse Ultrastar txt ( basename_without_ext, real_bpm, @@ -299,33 +386,40 @@ def run() -> None: ultrastar_audio_input_path, ultrastar_class, ) = parse_ultrastar_txt() + + if not ultrastar_class.mp3: + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} The provided text file does not have a reference to " + f"an audio file." + ) + exit(1) elif settings.input_file_path.startswith("https:"): # Youtube - print( - f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}" - ) + print(f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}") ( basename_without_ext, song_output, ultrastar_audio_input_path, - (title, artist, year, genre) + (title, artist, year, genre), ) = download_from_youtube() else: # Audio File - print( - f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}" - ) + print(f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}") ( basename_without_ext, song_output, ultrastar_audio_input_path, - (title, artist, year, genre) + (title, artist, year, genre), ) = infos_from_audio_input_file() - cache_path = os.path.join(song_output, "cache") + cache_path = ( + os.path.join(song_output, "cache") + if settings.cache_override_path is None + else settings.cache_override_path + ) settings.processing_audio_path = os.path.join( cache_path, basename_without_ext + ".wav" ) - os_helper.create_folder(cache_path) + os_helper.create_folder(cache_path) # Separate vocal from audio audio_separation_path = separate_vocal_from_audio( basename_without_ext, cache_path, ultrastar_audio_input_path ) @@ -373,17 +467,17 @@ def run() -> None: # Audio transcription transcribed_data = None language = settings.language - if is_audio: - detected_language, transcribed_data = transcribe_audio() + if not settings.ignore_audio: + transcription_result = transcribe_audio(cache_path) if language is None: - language = detected_language + language = transcription_result.detected_language + transcribed_data = transcription_result.transcribed_data - remove_unecessary_punctuations(transcribed_data) + remove_unecessary_punctuations(transcription_result.transcribed_data) if settings.hyphenation: - hyphen_words = hyphenate_each_word(language, transcribed_data) + hyphen_words = hyphenate_each_word(language, transcription_result.transcribed_data) if hyphen_words is not None: -
transcribed_data = add_hyphen_to_data(transcribed_data, hyphen_words) + transcribed_data = add_hyphen_to_data(transcription_result.transcribed_data, hyphen_words) transcribed_data = remove_silence_from_transcription_data( settings.processing_audio_path, transcribed_data @@ -393,11 +487,12 @@ # lyric = 'input/faber_lyric.txt' # --corrected_words = correct_words(vosk_speech, lyric) + transcribed_data = split_syllables_into_segments(transcribed_data) + # Create audio chunks if settings.create_audio_chunks: create_audio_chunks( cache_path, - is_audio, transcribed_data, ultrastar_audio_input_path, ultrastar_class, ) @@ -405,7 +500,10 @@ # Pitch the audio midi_segments, pitched_data, ultrastar_note_numbers, transcribed_data = pitch_audio( - is_audio, transcribed_data, ultrastar_class + transcribed_data, ultrastar_class, cache_path) + + transcribed_data, midi_notes, ultrastar_note_numbers = merge_syllable_segments( + transcribed_data, midi_segments, ultrastar_note_numbers ) # Create plot @@ -416,7 +514,7 @@ plot(pitched_data, song_output, midi_segments) # Write Ultrastar txt - if is_audio: + if not settings.ignore_audio: real_bpm, ultrastar_file_output = create_ultrastar_txt_from_automation( basename_without_ext, song_output, @@ -427,22 +525,25 @@ title, artist, year, - genre + genre, ) else: ultrastar_file_output = create_ultrastar_txt_from_ultrastar_data( song_output, ultrastar_class, ultrastar_note_numbers ) - # Calc Points - ultrastar_class, simple_score, accurate_score = calculate_score_points( - is_audio, pitched_data, ultrastar_class, ultrastar_file_output - ) + simple_score = None + accurate_score = None + if settings.calculate_score: + # Calc Points + ultrastar_class, simple_score, accurate_score = calculate_score_points( + pitched_data, ultrastar_class, ultrastar_file_output + ) - # Add calculated score to Ultrastar txt #Todo: Missing Karaoke - ultrastar_writer.add_score_to_ultrastar_txt( - ultrastar_file_output, simple_score - ) + # Add calculated score to Ultrastar txt #Todo: Missing Karaoke + ultrastar_writer.add_score_to_ultrastar_txt( + ultrastar_file_output, simple_score + ) # Midi if settings.create_midi: @@ -454,6 +555,7 @@ # Print Support print_support() + return ultrastar_file_output, simple_score, accurate_score def mute_no_singing_parts(mono_output_path, mute_output_path): @@ -493,58 +595,83 @@ def get_unused_song_output_dir(path: str) -> str: print( f"{ULTRASINGER_HEAD} {red_highlighted('Error: Could not create output folder! (999) is the maximum number of tries.')}" ) - sys.exit(1) + raise ValueError("Could not create output folder! (999) is the maximum number of tries.") return path
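Because `run()` now returns the output txt path plus both scores, and the folder-creation failure raises instead of calling `sys.exit`, other scripts can drive UltraSinger in-process and recover from per-song failures. A hedged sketch of that usage, mirroring what `UltraSingerEvaluation.py` below does (paths illustrative):

```python
# Hedged sketch of driving UltraSinger programmatically; the input path is
# illustrative and further Settings fields may need adjusting per song.
import UltraSinger
from Settings import Settings

song_settings = Settings()
song_settings.input_file_path = "songs/Artist - Title/Artist - Title.txt"
song_settings.calculate_score = True

UltraSinger.settings = song_settings
try:
    output_txt, simple_score, accurate_score = UltraSinger.run()
except ValueError as error:  # e.g. the output-folder error raised above
    print(f"Processing failed: {error}")
```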
-def transcribe_audio() -> (str, list[TranscribedData]): +def transcribe_audio(cache_path: str) -> TranscriptionResult: """Transcribe audio with AI""" + transcription_result = None if settings.transcriber == "whisper": - device = "cpu" if settings.force_whisper_cpu else settings.pytorch_device - transcribed_data, detected_language = transcribe_with_whisper( - settings.processing_audio_path, - settings.whisper_model, - device, - settings.whisper_align_model, - settings.whisper_batch_size, - settings.whisper_compute_type, - settings.language, - ) + transcription_config = f"{settings.transcriber}_{settings.whisper_model}_{settings.pytorch_device}_{settings.whisper_align_model}_{settings.whisper_batch_size}_{settings.whisper_compute_type}_{settings.language}" + transcription_path = os.path.join(cache_path, f"{transcription_config}.json") + cached_transcription_available = check_file_exists(transcription_path) + if settings.skip_cache_transcription or not cached_transcription_available: + device = "cpu" if settings.force_whisper_cpu else settings.pytorch_device + transcription_result = transcribe_with_whisper( + settings.processing_audio_path, + settings.whisper_model, + device, + settings.whisper_align_model, + settings.whisper_batch_size, + settings.whisper_compute_type, + settings.language, + ) + with open(transcription_path, "w", encoding=FILE_ENCODING) as file: + file.write(transcription_result.to_json()) + else: + print( + f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached transcribed data" + ) + with open(transcription_path) as file: + json = file.read() + transcription_result = TranscriptionResult.from_json(json) else: raise NotImplementedError - return detected_language, transcribed_data + return transcription_result def separate_vocal_from_audio( - basename_without_ext: str, cache_path: str, ultrastar_audio_input_path: str + basename_without_ext: str, cache_path: str, ultrastar_audio_input_path: str ) -> str: """Separate vocal from audio""" + demucs_output_folder = os.path.splitext( + os.path.basename(ultrastar_audio_input_path) + )[0] audio_separation_path = os.path.join( - cache_path, "separated", "htdemucs", basename_without_ext + cache_path, "separated", "htdemucs", demucs_output_folder ) + vocals_path = os.path.join(audio_separation_path, "vocals.wav") + instrumental_path = os.path.join(audio_separation_path, "no_vocals.wav") if settings.use_separated_vocal or settings.create_karaoke: - separate_audio(ultrastar_audio_input_path, cache_path, settings.pytorch_device) + cache_available = check_file_exists(vocals_path) and check_file_exists( + instrumental_path + ) + if settings.skip_cache_vocal_separation or not cache_available: + separate_audio( + ultrastar_audio_input_path, cache_path, settings.pytorch_device + ) + else: + print( + f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached separated vocals" + ) return audio_separation_path
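Transcription, vocal separation, and (below) pitch detection now share the same compute-or-load shape: derive a cache key from every setting that influences the result, recompute when a `skip_cache_*` flag is set or no cached file exists, otherwise deserialize the previous result. A generic sketch of the pattern, assuming a result type with `dataclasses_json`-style `to_json()`/`from_json()` methods:

```python
# Generic shape of the compute-or-load caching used above; `result_type` is
# assumed to offer to_json()/from_json() as the dataclasses_json types here do.
import os

def compute_or_load(cache_path, config_key, compute, result_type, skip_cache=False):
    path = os.path.join(cache_path, f"{config_key}.json")
    if skip_cache or not os.path.exists(path):
        result = compute()
        with open(path, "w", encoding="utf-8") as file:
            file.write(result.to_json())
        return result
    with open(path, encoding="utf-8") as file:  # cache hit: reuse previous run
        return result_type.from_json(file.read())
```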
def calculate_score_points( - is_audio: bool, pitched_data: PitchedData, ultrastar_class: UltrastarTxtValue, ultrastar_file_output: str + pitched_data: PitchedData, + ultrastar_class: UltrastarTxtValue, + ultrastar_file_output: str, ): """Calculate score points""" - if is_audio: - ultrastar_class = ultrastar_parser.parse_ultrastar_txt( - ultrastar_file_output - ) + if not settings.ignore_audio: + ultrastar_class = ultrastar_parser.parse_ultrastar_txt(ultrastar_file_output) ( simple_score, accurate_score, ) = 
ultrastar_score_calculator.calculate_score( - pitched_data, ultrastar_class - ) - ultrastar_score_calculator.print_score_calculation( - simple_score, accurate_score - ) + ) = ultrastar_score_calculator.calculate_score(pitched_data, ultrastar_class) + ultrastar_score_calculator.print_score_calculation(simple_score, accurate_score) else: print( f"{ULTRASINGER_HEAD} {blue_highlighted('Score of original Ultrastar txt')}" ) @@ -552,32 +679,24 @@ def calculate_score_points( ( simple_score, accurate_score, - ) = ultrastar_score_calculator.calculate_score( - pitched_data, ultrastar_class - ) - ultrastar_score_calculator.print_score_calculation( - simple_score, accurate_score - ) + ) = ultrastar_score_calculator.calculate_score(pitched_data, ultrastar_class) + ultrastar_score_calculator.print_score_calculation(simple_score, accurate_score) print( f"{ULTRASINGER_HEAD} {blue_highlighted('Score of re-pitched Ultrastar txt')}" ) - ultrastar_class = ultrastar_parser.parse_ultrastar_txt( - ultrastar_file_output - ) + ultrastar_class = ultrastar_parser.parse_ultrastar_txt(ultrastar_file_output) ( simple_score, accurate_score, - ) = ultrastar_score_calculator.calculate_score( - pitched_data, ultrastar_class - ) - ultrastar_score_calculator.print_score_calculation( - simple_score, accurate_score - ) + ) = ultrastar_score_calculator.calculate_score(pitched_data, ultrastar_class) + ultrastar_score_calculator.print_score_calculation(simple_score, accurate_score) return ultrastar_class, simple_score, accurate_score def create_ultrastar_txt_from_ultrastar_data( - song_output: str, ultrastar_class: UltrastarTxtValue, ultrastar_note_numbers: list[int] + song_output: str, + ultrastar_class: UltrastarTxtValue, + ultrastar_note_numbers: list[int], ) -> str: """Create Ultrastar txt from Ultrastar data""" output_repitched_ultrastar = os.path.join( @@ -601,7 +720,7 @@ def create_ultrastar_txt_from_automation( title: str, artist: str, year: str, - genre: str + genre: str, ): """Create Ultrastar txt from automation""" ultrastar_header = UltrastarTxtValue() @@ -616,9 +735,7 @@ def create_ultrastar_txt_from_automation( ultrastar_header.language = language cover = basename_without_ext + " [CO].jpg" ultrastar_header.cover = ( - cover - if os_helper.check_file_exists(os.path.join(song_output, cover)) - else None + cover if os_helper.check_file_exists(os.path.join(song_output, cover)) else None ) ultrastar_header.creator = f"{ultrastar_header.creator} {Settings.APP_VERSION}" ultrastar_header.comment = f"{ultrastar_header.comment} {Settings.APP_VERSION}" @@ -634,9 +751,7 @@ def create_ultrastar_txt_from_automation( ultrastar_header.genre = format_separated_string(genre) real_bpm = get_bpm_from_file(ultrastar_audio_input_path) - ultrastar_file_output = os.path.join( - song_output, basename_without_ext + ".txt" - ) + ultrastar_file_output = os.path.join(song_output, basename_without_ext + ".txt") ultrastar_writer.create_ultrastar_txt_from_automation( transcribed_data, ultrastar_note_numbers, @@ -715,9 +830,17 @@ def infos_from_audio_input_file() -> tuple[str, str, str, tuple[str, str, str, s song_output = get_unused_song_output_dir(song_output) os_helper.create_folder(song_output) os_helper.copy(settings.input_file_path, song_output) - os_helper.rename(os.path.join(song_output, os.path.basename(settings.input_file_path)), os.path.join(song_output, basename)) + os_helper.rename( + os.path.join(song_output, os.path.basename(settings.input_file_path)), + os.path.join(song_output, basename), + ) ultrastar_audio_input_path = 
os.path.join(song_output, basename) - return basename_without_ext, song_output, ultrastar_audio_input_path, (title, artist, year_info, genre_info) + return ( + basename_without_ext, + song_output, + ultrastar_audio_input_path, + (title, artist, year_info, genre_info), + ) FILENAME_REPLACEMENTS = (('?:"', ""), ("<", "("), (">", ")"), ("/\\|*", "-")) @@ -738,7 +861,9 @@ def download_from_youtube() -> tuple[str, str, str, tuple[str, str, str, str]]: (artist, title) = get_youtube_title(settings.input_file_path) # Get additional data for song - (title_info, artist_info, year_info, genre_info) = get_music_infos(f"{artist} - {title}") + (title_info, artist_info, year_info, genre_info) = get_music_infos( + f"{artist} - {title}" + ) if title_info is not None: title = title_info @@ -749,29 +874,29 @@ def download_from_youtube() -> tuple[str, str, str, tuple[str, str, str, str]]: song_output = os.path.join(settings.output_file_path, basename_without_ext) song_output = get_unused_song_output_dir(song_output) os_helper.create_folder(song_output) - download_youtube_audio( - settings.input_file_path, basename_without_ext, song_output - ) - download_youtube_video( - settings.input_file_path, basename_without_ext, song_output - ) + download_youtube_audio(settings.input_file_path, basename_without_ext, song_output) + download_youtube_video(settings.input_file_path, basename_without_ext, song_output) download_youtube_thumbnail( settings.input_file_path, basename_without_ext, song_output ) ultrastar_audio_input_path = os.path.join(song_output, basename) - return basename_without_ext, song_output, ultrastar_audio_input_path, (title, artist, year_info, genre_info) + return ( + basename_without_ext, + song_output, + ultrastar_audio_input_path, + (title, artist, year_info, genre_info), + ) def parse_ultrastar_txt() -> tuple[str, float, str, str, UltrastarTxtValue]: """Parse Ultrastar txt""" - ultrastar_class = ultrastar_parser.parse_ultrastar_txt( - settings.input_file_path - ) + ultrastar_class = ultrastar_parser.parse_ultrastar_txt(settings.input_file_path) real_bpm = ultrastar_converter.ultrastar_bpm_to_real_bpm( float(ultrastar_class.bpm.replace(",", ".")) ) ultrastar_mp3_name = ultrastar_class.mp3 - basename_without_ext = os.path.splitext(ultrastar_mp3_name)[0] + + basename_without_ext = f"{ultrastar_class.artist} - {ultrastar_class.title}" dirname = os.path.dirname(settings.input_file_path) ultrastar_audio_input_path = os.path.join(dirname, ultrastar_mp3_name) song_output = os.path.join( @@ -790,37 +915,58 @@ def parse_ultrastar_txt() -> tuple[str, float, str, str, UltrastarTxtValue]: ) -def create_midi_file(real_bpm: float, - song_output: str, - ultrastar_class: UltrastarTxtValue, - basename_without_ext: str) -> None: +def create_midi_file( + real_bpm: float, + song_output: str, + ultrastar_class: UltrastarTxtValue, + basename_without_ext: str, +) -> None: """Create midi file""" - print( - f"{ULTRASINGER_HEAD} Creating Midi with {blue_highlighted('pretty_midi')}" - ) + print(f"{ULTRASINGER_HEAD} Creating Midi with {blue_highlighted('pretty_midi')}") voice_instrument = [ convert_ultrastar_to_midi_instrument(ultrastar_class) ] midi_output = os.path.join(song_output, f"{basename_without_ext}.mid") - instruments_to_midi( - voice_instrument, real_bpm, midi_output - ) + instruments_to_midi(voice_instrument, real_bpm, midi_output) -def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrastar_class: UltrastarTxtValue) -> tuple[ - list[MidiSegment], PitchedData, list[int], 
list[TranscribedData]] +def pitch_audio( + transcribed_data: list[TranscribedData], + ultrastar_class: UltrastarTxtValue, + cache_path: str, +) -> tuple[list[MidiSegment], PitchedData, list[int], list[TranscribedData]]: """Pitch audio""" # todo: chunk pitching as option? # midi_notes = pitch_each_chunk_with_crepe(chunk_folder_name) - device = "cpu" if settings.force_crepe_cpu else settings.tensorflow_device - pitched_data = get_pitch_with_crepe_file( - settings.processing_audio_path, - settings.crepe_model_capacity, - settings.crepe_step_size, - device, - ) - if is_audio: + + pitching_config = f"crepe_{settings.ignore_audio}_{settings.crepe_model_capacity}_{settings.crepe_step_size}_{settings.tensorflow_device}_{settings.pitch_loudness_threshold}" + pitched_data_path = os.path.join(cache_path, f"{pitching_config}.json") + cache_available = check_file_exists(pitched_data_path) + pitched_data = None + + if settings.skip_cache_pitch_detection or not cache_available: + device = "cpu" if settings.force_crepe_cpu else settings.tensorflow_device + pitched_data = get_pitch_with_crepe_file( + settings.processing_audio_path, + settings.crepe_model_capacity, + settings.crepe_step_size, + device, + settings.pitch_loudness_threshold, + ) + + pitched_data_json = pitched_data.to_json() + with open(pitched_data_path, "w", encoding=FILE_ENCODING) as file: + file.write(pitched_data_json) + else: + print( + f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached pitch data" + ) + with open(pitched_data_path) as file: + json = file.read() + pitched_data = PitchedData.from_json(json) + + if not settings.ignore_audio: start_times = [] end_times = [] words = [] @@ -838,24 +984,21 @@ def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrast new_transcribed_data = [] for i, midi_segment in enumerate(midi_segments): - new_transcribed_data.append(TranscribedData({"word": midi_segment.word, "start": midi_segment.start, "end": midi_segment.end, "is_hyphen": None, "confidence": 1})) + new_transcribed_data.append(TranscribedData(word=midi_segment.word, start=midi_segment.start, end=midi_segment.end, is_hyphen=None, confidence=1)) return midi_segments, pitched_data, ultrastar_note_numbers, new_transcribed_data def create_audio_chunks( cache_path: str, - is_audio: bool, transcribed_data: list[TranscribedData], ultrastar_audio_input_path: str, - ultrastar_class: UltrastarTxtValue + ultrastar_class: UltrastarTxtValue, ) -> None: """Create audio chunks""" - audio_chunks_path = os.path.join( - cache_path, settings.audio_chunk_folder_name - ) + audio_chunks_path = os.path.join(cache_path, settings.audio_chunk_folder_name) os_helper.create_folder(audio_chunks_path) - if is_audio: # and csv + if not settings.ignore_audio: # and csv csv_filename = os.path.join(audio_chunks_path, "_chunks.csv") export_chunks_from_transcribed_data( settings.processing_audio_path, transcribed_data, audio_chunks_path @@ -870,6 +1013,19 @@ def denoise_vocal_audio(input_path: str, output_path: str) -> None: """Denoise vocal audio""" ffmpeg_reduce_noise(input_path, output_path) + # Fixme: Merge issue + # denoised_path = os.path.join(cache_path, basename_without_ext + "_denoised.wav") + # cache_available = check_file_exists(denoised_path) + # + # if settings.skip_cache_denoise_vocal_audio or not cache_available: + # ffmpeg_reduce_noise(settings.mono_audio_path, denoised_path) + # else: + # print( + # f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached denoised audio" + # ) + # + # settings.mono_audio_path = 
denoised_path + def main(argv: list[str]) -> None: """Main function""" @@ -924,6 +1080,8 @@ def init_settings(argv: list[str]) -> None: settings.create_karaoke = not arg elif opt in ("--create_audio_chunks"): settings.create_audio_chunks = arg + elif opt in ("--ignore_audio"): + settings.ignore_audio = arg in ["True", "true"] elif opt in ("--force_cpu"): settings.force_cpu = arg if settings.force_cpu: @@ -970,6 +1128,7 @@ def arg_options(): "disable_separation=", "disable_karaoke=", "create_audio_chunks=", + "ignore_audio=", "force_cpu=", "force_whisper_cpu=", "force_crepe_cpu=", diff --git a/src/UltraSingerEvaluation.py b/src/UltraSingerEvaluation.py new file mode 100644 index 0000000..1d06292 --- /dev/null +++ b/src/UltraSingerEvaluation.py @@ -0,0 +1,182 @@ +import copy +import os +import traceback +from datetime import datetime +from pathlib import Path +from typing import List +import importlib.util + +import pandas + +import UltraSinger +from Settings import Settings +from modules.DeviceDetection.device_detection import check_gpu_support +from modules.Research.TestRun import TestRun, TestedSong +from modules.Research.TestSong import TestSong +from modules.Ultrastar import ultrastar_parser +from modules.Ultrastar.ultrastar_converter import compare_pitches +from modules.Ultrastar.ultrastar_parser import parse_ultrastar_txt +from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue, FILE_ENCODING +from modules.console_colors import ULTRASINGER_HEAD, red_highlighted + +test_input_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_input")) +test_output_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_output")) +test_start_time = datetime.now() +test_run_folder = os.path.join( + test_output_folder, test_start_time.strftime("%Y-%m-%d_%H-%M-%S") +) +test_run_songs_folder = os.path.join(test_run_folder, "songs") + + +def main() -> None: + """Main function""" + Path(test_input_folder).mkdir(parents=True, exist_ok=True) + Path(test_output_folder).mkdir(parents=True, exist_ok=True) + Path(test_run_folder).mkdir(parents=True) + Path(test_run_songs_folder).mkdir(parents=True) + + base_settings = initialize_settings() + base_settings.output_file_path = test_run_songs_folder + + base_settings.test_songs_input_folder = os.path.normpath( + base_settings.test_songs_input_folder + ) + if not os.path.isdir(base_settings.test_songs_input_folder): + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} No test songs input folder configured (refer to " + f"evaluation section in readme)." + ) + exit(1) + + test_songs: List[TestSong] = [] + for dir_entry in os.listdir(base_settings.test_songs_input_folder): + song_folder = os.path.join(base_settings.test_songs_input_folder, dir_entry) + found_song = find_ultrastar_song(song_folder) + if found_song is None: + continue + + test_songs.append(TestSong(found_song[0], song_folder, found_song[1])) + + if len(test_songs) == 0: + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} No test songs found in {base_settings.test_songs_input_folder}." 
+ ) + exit(1) + + print(f"{ULTRASINGER_HEAD} Running evaluation for {len(test_songs)} songs") + + test_run = TestRun(base_settings, test_start_time) + for index, test_song in enumerate(test_songs): + print(f"{ULTRASINGER_HEAD} ========================") + print( + f"{ULTRASINGER_HEAD} {index+1}/{len(test_songs)}: {os.path.basename(test_song.input_txt)}" + ) + + # prepare cache directory + song_cache_path = os.path.join(test_song.input_folder, "cache") + Path(song_cache_path).mkdir(parents=True, exist_ok=True) + + test_song_settings = copy.deepcopy(base_settings) + test_song_settings.input_file_path = test_song.input_txt + test_song_settings.cache_override_path = song_cache_path + UltraSinger.settings = test_song_settings + + tested_song = TestedSong(test_song.input_txt) + test_run.tested_songs.append(tested_song) + try: + output_txt, _, _ = UltraSinger.run() + except Exception as error: + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} Failed to process {test_song.input_txt}\n{error}." + ) + traceback.print_exc() + continue + + + output_folder_name = f"{test_song.input_ultrastar_class.artist} - {test_song.input_ultrastar_class.title}" + output_folder = os.path.join(test_run_songs_folder, output_folder_name) + + if not os.path.isfile(output_txt): + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} Could not find song txt in '{output_folder}'." + ) + test_run.tested_songs.append(tested_song) + continue + + ultrastar_class = parse_ultrastar_txt(output_txt) + ( + input_match_ratio, + output_match_ratio, + input_pitch_shift_match_ratios, + output_pitch_shift_match_ratios, + pitch_where_should_be_no_pitch_ratio, + no_pitch_where_should_be_pitch_ratio, + ) = compare_pitches(test_song.input_ultrastar_class, ultrastar_class) + + tested_song.output_path = output_txt + tested_song.success = True + tested_song.input_match_ratio = input_match_ratio + tested_song.output_match_ratio = output_match_ratio + tested_song.input_pitch_shift_match_ratios = input_pitch_shift_match_ratios + tested_song.output_pitch_shift_match_ratios = output_pitch_shift_match_ratios + tested_song.pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch_ratio + tested_song.no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch_ratio + + test_run.end_time = datetime.now() + test_run_result_file = os.path.join(test_run_folder, "run.json") + test_run_json = test_run.to_json() + with open(test_run_result_file, "w", encoding=FILE_ENCODING) as file: + file.write(test_run_json) + + +def find_ultrastar_song( + song_folder, require_audio: bool = True +) -> tuple[str, UltrastarTxtValue]: + if os.path.isdir(song_folder): + for song_folder_item in os.listdir(song_folder): + if ( + song_folder_item.endswith(".txt") + and song_folder_item != "license.txt" + and not song_folder_item.endswith("[Karaoke].txt") + and not song_folder_item.endswith("[MULTI].txt") + and not song_folder_item.endswith("[DUET].txt") + and not song_folder_item.endswith("instrumental.txt") + ): + txt_file = os.path.join(song_folder, song_folder_item) + ultrastar_class = ultrastar_parser.parse_ultrastar_txt(txt_file) + + if ultrastar_class.mp3 != "" or not require_audio: + return txt_file, ultrastar_class + else: + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Warning.')} {song_folder} contains an UltraStar text file but has no audio referenced in it. Skipping." 
+ ) + + +def initialize_settings(): + s = Settings() + user_config_file = os.path.normpath( + os.path.join(test_input_folder, "config/local.py") + ) + if os.path.isfile(user_config_file): + print( + f"{ULTRASINGER_HEAD} Using custom settings found under {user_config_file}" + ) + + spec = importlib.util.spec_from_file_location( + "custom_settings", user_config_file + ) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + s = module.user_settings + else: + print(f"{ULTRASINGER_HEAD} No custom settings found under {user_config_file}") + + if not s.force_cpu: + s.tensorflow_device, s.pytorch_device = check_gpu_support() + return s + + +if __name__ == "__main__": + main() diff --git a/src/UltraSingerMetaEvaluation.py b/src/UltraSingerMetaEvaluation.py new file mode 100644 index 0000000..d26da4b --- /dev/null +++ b/src/UltraSingerMetaEvaluation.py @@ -0,0 +1,109 @@ +import os +from pathlib import Path +from typing import List + +import pandas + +from modules.Research.TestRun import TestRun +from modules.console_colors import ULTRASINGER_HEAD, red_highlighted + +test_input_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_input")) +test_output_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_output")) + + +def main() -> None: + """Main function""" + Path(test_output_folder).mkdir(parents=True, exist_ok=True) + + test_runs: List[TestRun] = [] + for dir_entry in os.listdir(test_output_folder): + test_run_folder = os.path.join(test_output_folder, dir_entry) + test_run = find_test_run_result(test_run_folder) + if test_run is None: + continue + + test_runs.append(test_run) + + if len(test_runs) == 0: + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} No test runs found in {test_output_folder}." 
+ ) + exit(1) + + print(f"{ULTRASINGER_HEAD} Running meta evaluation for {len(test_runs)} test runs") + + for test_run in test_runs: + tested_songs_dicts = [] + for tested_song in [s for s in test_run.tested_songs if s.success]: + tested_song_dict = tested_song.to_dict() + + best_input_pitch_shift_match_ratio = max( + tested_song.input_pitch_shift_match_ratios.values() + ) + + # based on the pitch shift of the highest input_pitch_shift_match_ratio picked previously + # we pick the corresponding value of output_pitch_shift_match_ratios + matching_input_best_output_pitch_shift_match_ratio = ( + tested_song.output_pitch_shift_match_ratios[ + list(tested_song.input_pitch_shift_match_ratios.values()).index( + best_input_pitch_shift_match_ratio + ) + ] + ) + + best_output_pitch_shift_match_ratio = max( + tested_song.output_pitch_shift_match_ratios.values() + ) + + # based on the pitch shift of the highest output_pitch_shift_match_ratio picked previously + # we pick the corresponding value of input_pitch_shift_match_ratios + matching_output_best_input_pitch_shift_match_ratio = ( + tested_song.input_pitch_shift_match_ratios[ + list(tested_song.output_pitch_shift_match_ratios.values()).index( + best_output_pitch_shift_match_ratio + ) + ] + ) + + tested_song_dict[ + "best_input_pitch_shift_match_ratio" + ] = best_input_pitch_shift_match_ratio + tested_song_dict[ + "matching_input_best_output_pitch_shift_match_ratio" + ] = matching_input_best_output_pitch_shift_match_ratio + tested_song_dict[ + "best_output_pitch_shift_match_ratio" + ] = best_output_pitch_shift_match_ratio + tested_song_dict[ + "matching_output_best_input_pitch_shift_match_ratio" + ] = matching_output_best_input_pitch_shift_match_ratio + + tested_songs_dicts.append(tested_song_dict) + + records = pandas.DataFrame.from_records(tested_songs_dicts) + pandas.options.display.max_columns = records.shape[1] + describe_result = records.describe(percentiles=[0.25, 0.5, 0.75, 0.95, 0.99]) + print(describe_result) + + print("Done") + + +def find_test_run_result(test_run_folder) -> TestRun: + if os.path.isdir(test_run_folder): + for test_run_folder_item in os.listdir(test_run_folder): + test_run_folder_item_path = os.path.join( + test_run_folder, test_run_folder_item + ) + if ( + os.path.isfile(test_run_folder_item_path) + and test_run_folder_item == "run.json" + ): + test_run = None + with open(test_run_folder_item_path) as file: + json = file.read() + test_run = TestRun.from_json(json) + return test_run + + +if __name__ == "__main__": + main() diff --git a/src/modules/Audio/separation.py b/src/modules/Audio/separation.py index e6d0485..55335ac 100644 --- a/src/modules/Audio/separation.py +++ b/src/modules/Audio/separation.py @@ -1,7 +1,10 @@ """Separate vocals from audio""" - +import os +import shlex import subprocess +import demucs.separate + from modules.console_colors import ( ULTRASINGER_HEAD, blue_highlighted, @@ -10,16 +13,20 @@ from modules.os_helper import current_executor_path, move, path_join -def separate_audio(input_file_path: str, output_file: str, device="cpu") -> None: +def separate_audio(input_file_path: str, output_folder: str, device="cpu") -> None: """Separate vocals from audio with demucs.""" print( f"{ULTRASINGER_HEAD} Separating vocals from audio with {blue_highlighted('demucs')} and {red_highlighted(device)} as worker." ) + # Model selection? 
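The comment above asks how a model could be selected; since demucs.separate.main accepts the same argument list as the demucs CLI (as the call further below shows), the -n flag is the natural hook. A minimal sketch, assuming a hypothetical separate_with_model helper and the htdemucs_ft model named in the next comment:

import os
import shlex

import demucs.separate


def separate_with_model(input_file_path: str, output_folder: str,
                        device: str = "cpu", model: str = "htdemucs_ft") -> None:
    # demucs.separate.main parses the same flags as the `demucs` CLI entry point;
    # "-n" selects the model and "--float32" restores the flag the fixme below notes.
    demucs.separate.main(shlex.split(
        f'-n {model} --two-stems vocals --float32 -d {device} '
        f'--out "{os.path.join(output_folder, "separated")}" "{input_file_path}"'
    ))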
# -n htdemucs_ft - subprocess.run( - ["demucs", "-d", device, "--two-stems=vocals", "--float32", input_file_path] - ) - separated_folder = path_join(current_executor_path(), "separated") - move(separated_folder, output_file) \ No newline at end of file + # subprocess.run( + # ["demucs", "-d", device, "--two-stems=vocals", "--float32", input_file_path] + # ) + # separated_folder = path_join(current_executor_path(), "separated") + # move(separated_folder, output_file) + + # fixme "--float32" is missing + demucs.separate.main(shlex.split(f'--two-stems vocals -d {device} --out "{os.path.join(output_folder, "separated")}" "{input_file_path}"')) diff --git a/src/modules/Audio/silence_processing.py b/src/modules/Audio/silence_processing.py index da11172..46f9fea 100644 --- a/src/modules/Audio/silence_processing.py +++ b/src/modules/Audio/silence_processing.py @@ -63,7 +63,7 @@ def remove_silence(silence_parts_list: list[tuple[float, float]], transcribed_da split_word = "~ " is_word_end = True - split_data = TranscribedData({"conf": data.conf, "word": split_word, "end": split_end, "start": silence_end, "is_word_end": is_word_end}) + split_data = TranscribedData({"conf": data.confidence, "word": split_word, "end": split_end, "start": silence_end, "is_word_end": is_word_end}) if not was_split: data.end = silence_start diff --git a/src/modules/Pitcher/core.py b/src/modules/Pitcher/core.py new file mode 100644 index 0000000..7b252a6 --- /dev/null +++ b/src/modules/Pitcher/core.py @@ -0,0 +1 @@ +CREPE_MODEL_SAMPLE_RATE = 16000 \ No newline at end of file diff --git a/src/modules/Pitcher/loudness.py b/src/modules/Pitcher/loudness.py new file mode 100644 index 0000000..f72d8a5 --- /dev/null +++ b/src/modules/Pitcher/loudness.py @@ -0,0 +1,69 @@ +import warnings + +import librosa +import numpy as np +from modules.Pitcher.core import CREPE_MODEL_SAMPLE_RATE + +############################################################################### +# Constants +############################################################################### + +WINDOW_SIZE = 1024 +TIMES_DECIMAL_PLACES: int = 3 +# Minimum decibel level +MIN_DB = -100. + +# Reference decibel level +REF_DB = 20. + +def set_confidence_to_zero_in_silent_regions(confidence, audio, threshold=-60, step_size=10, pad=True): + # Don't modify in-place + confidence = confidence[:] + + # Compute loudness + loudness = a_weighted(audio, step_size, pad) + + # Threshold silence + confidence[loudness < threshold] = 0. + + return confidence, loudness + +def a_weighted(audio, step_size=10, pad=True): + """Retrieve the per-frame loudness""" + step_size_seconds = round(step_size / 1000, TIMES_DECIMAL_PLACES) + steps_per_second = 1 / step_size_seconds + hop_length = int(CREPE_MODEL_SAMPLE_RATE // steps_per_second) + + a_perceptual_weights = perceptual_weights() + + # Take stft + stft = librosa.stft(audio, + n_fft=WINDOW_SIZE, + hop_length=hop_length, + win_length=WINDOW_SIZE, + center=pad, + pad_mode='constant') + + # Compute magnitude on db scale + db = librosa.amplitude_to_db(np.abs(stft)) + + # Apply A-weighting + weighted = db + a_perceptual_weights + + # Threshold + weighted[weighted < MIN_DB] = MIN_DB + + # Average over weighted frequencies + return weighted.mean(axis=0) + + +def perceptual_weights(): + """A-weighted frequency-dependent perceptual loudness weights""" + frequencies = librosa.fft_frequencies(sr=CREPE_MODEL_SAMPLE_RATE, + n_fft=WINDOW_SIZE) + + # A warning is raised for nearly inaudible frequencies, but it ends up + # defaulting to -100 db. 
That default is fine for our purposes. + with warnings.catch_warnings(): + warnings.simplefilter('ignore', RuntimeWarning) + return librosa.A_weighting(frequencies)[:, None] - REF_DB \ No newline at end of file diff --git a/src/modules/Pitcher/pitched_data.py b/src/modules/Pitcher/pitched_data.py index 13d828c..3edb088 100644 --- a/src/modules/Pitcher/pitched_data.py +++ b/src/modules/Pitcher/pitched_data.py @@ -1,7 +1,10 @@ """Pitched data""" from dataclasses import dataclass +from dataclasses_json import dataclass_json + +@dataclass_json @dataclass class PitchedData: """Pitched data from crepe""" @@ -9,3 +12,4 @@ class PitchedData: times: list[float] frequencies: list[float] confidence: list[float] + perceived_loudness_db: list[float] diff --git a/src/modules/Pitcher/pitcher.py b/src/modules/Pitcher/pitcher.py index 4312c07..efc9f17 100644 --- a/src/modules/Pitcher/pitcher.py +++ b/src/modules/Pitcher/pitcher.py @@ -1,43 +1,59 @@ """Pitcher module""" import crepe -from scipy.io import wavfile +import librosa from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted, red_highlighted +from modules.Pitcher.core import CREPE_MODEL_SAMPLE_RATE +from modules.Pitcher.loudness import set_confidence_to_zero_in_silent_regions from modules.Pitcher.pitched_data import PitchedData +import modules.timer as timer def get_pitch_with_crepe_file( - filename: str, model_capacity: str, step_size: int = 10, device: str = "cpu" + filename: str, model_capacity: str, step_size: int = 10, device: str = "cpu", filter_silence_threshold: int = -60 ) -> PitchedData: """Pitch with crepe""" print( f"{ULTRASINGER_HEAD} Pitching with {blue_highlighted('crepe')} and model {blue_highlighted(model_capacity)} and {red_highlighted(device)} as worker" ) - sample_rate, audio = wavfile.read(filename) + timer.log('Load file for pitch detection start') + audio, sample_rate = librosa.load(filename) + timer.log('Load file for pitch detection end') - return get_pitch_with_crepe(audio, sample_rate, model_capacity, step_size) + return get_pitch_with_crepe(audio, sample_rate, model_capacity, step_size, filter_silence_threshold) -def get_pitch_with_crepe( - audio, sample_rate: int, model_capacity: str, step_size: int = 10 -) -> PitchedData: +def get_pitch_with_crepe(audio, sample_rate: int, model_capacity: str, step_size: int = 10, filter_silence_threshold: int = -60) -> PitchedData: """Pitch with crepe""" + + if sample_rate != CREPE_MODEL_SAMPLE_RATE: + from resampy import resample + audio = resample(audio, sample_rate, CREPE_MODEL_SAMPLE_RATE) + sample_rate = CREPE_MODEL_SAMPLE_RATE + + timer.log('Crepe pitch detection start') # Info: The model is trained on 16 kHz audio, so if the input audio has a different sample rate, it will be first resampled to 16 kHz using resampy inside crepe. 
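As the note above says, crepe would resample internally anyway, but this hunk resamples up front so the same 16 kHz buffer can also feed the loudness filter. A minimal end-to-end sketch of the resulting pipeline (the input path is assumed):

import crepe
import librosa

from modules.Pitcher.core import CREPE_MODEL_SAMPLE_RATE
from modules.Pitcher.loudness import set_confidence_to_zero_in_silent_regions

# librosa resamples to the crepe model rate while loading
audio, _ = librosa.load("vocals.wav", sr=CREPE_MODEL_SAMPLE_RATE)

times, frequencies, confidence, _ = crepe.predict(
    audio, CREPE_MODEL_SAMPLE_RATE, "full", step_size=10, viterbi=True
)

# Zero the confidence wherever the A-weighted loudness is under -60 dB;
# copy() first, since slicing a NumPy array with [:] yields a view, not a copy.
confidence, perceived_loudness = set_confidence_to_zero_in_silent_regions(
    confidence.copy(), audio, threshold=-60, step_size=10
)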
+ times, frequencies, confidence, activation = crepe.predict(audio, sample_rate, model_capacity, step_size=step_size, viterbi=True) + timer.log('Crepe pitch detection end') - times, frequencies, confidence, activation = crepe.predict( - audio, sample_rate, model_capacity, step_size=step_size, viterbi=True - ) - return PitchedData(times, frequencies, confidence) + timer.log('Computing loudness start') + confidence, perceived_loudness = set_confidence_to_zero_in_silent_regions(confidence, audio, threshold=filter_silence_threshold, step_size=step_size) + timer.log('Computing loudness end') + + # convert to native float for serialization + confidence = [float(x) for x in confidence] + + return PitchedData(times, frequencies, confidence, perceived_loudness) def get_pitched_data_with_high_confidence( pitched_data: PitchedData, threshold=0.4 ) -> PitchedData: """Get frequency with high confidence""" - new_pitched_data = PitchedData([], [], []) + new_pitched_data = PitchedData([], [], [], []) for i, conf in enumerate(pitched_data.confidence): if conf > threshold: new_pitched_data.times.append(pitched_data.times[i]) diff --git a/src/modules/Research/TestRun.py b/src/modules/Research/TestRun.py new file mode 100644 index 0000000..ed573a8 --- /dev/null +++ b/src/modules/Research/TestRun.py @@ -0,0 +1,33 @@ +import datetime +from dataclasses import dataclass, field + +from dataclasses_json import dataclass_json + +from Settings import Settings + + +@dataclass_json +@dataclass +class TestedSong: + """Tested song""" + + input_path: str + output_path: str = "" + success: bool = False + input_match_ratio: float = 0.0 + output_match_ratio: float = 0.0 + input_pitch_shift_match_ratios: dict[int, float] = field(default_factory=lambda: {}) + output_pitch_shift_match_ratios: dict[int, float] = field(default_factory=lambda: {}) + no_pitch_where_should_be_pitch_ratio: float = 0.0 + pitch_where_should_be_no_pitch_ratio: float = 0.0 + + +@dataclass_json +@dataclass +class TestRun: + """Test run""" + + settings: Settings + start_time: datetime.datetime = None + end_time: datetime.datetime = None + tested_songs: list[TestedSong] = field(default_factory=lambda: []) diff --git a/src/modules/Research/TestSong.py b/src/modules/Research/TestSong.py new file mode 100644 index 0000000..be3a1b4 --- /dev/null +++ b/src/modules/Research/TestSong.py @@ -0,0 +1,12 @@ +from dataclasses import dataclass + +from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue + + +@dataclass +class TestSong: + """Test song""" + + input_txt: str + input_folder: str + input_ultrastar_class: UltrastarTxtValue diff --git a/src/modules/Speech_Recognition/TranscribedData.py b/src/modules/Speech_Recognition/TranscribedData.py index 6ff4ee3..465ef75 100644 --- a/src/modules/Speech_Recognition/TranscribedData.py +++ b/src/modules/Speech_Recognition/TranscribedData.py @@ -1,21 +1,29 @@ -"""Docstring""" +from dataclasses import dataclass +from dataclasses_json import dataclass_json + +@dataclass_json +@dataclass class TranscribedData: """Transcribed data from json file""" - def __init__(self, transcribed_json = None): - if transcribed_json is None: - return - # Vosk = conf, Whisper = confidence - self.conf = transcribed_json.get( - "conf", transcribed_json.get("confidence", None) - ) - # Vosk = word, Whisper = text - self.word = transcribed_json.get( - "word", transcribed_json.get("text", None) - ) - self.end = transcribed_json.get("end", None) - self.start = transcribed_json.get("start", None) - self.is_hyphen = transcribed_json.get("is_hyphen", 
None) - self.is_word_end = transcribed_json.get("is_word_end", True) + confidence: float = 0 + word: str = "" + start: float = 0 + end: float = 0 + is_hyphen: bool = False + is_word_end: bool = True + + +def from_whisper(whisper_dict) -> TranscribedData: + transcribed_data = TranscribedData() + if "score" in whisper_dict: + transcribed_data.confidence = whisper_dict["score"] + if "word" in whisper_dict: + transcribed_data.word = whisper_dict["word"] + if "start" in whisper_dict: + transcribed_data.start = whisper_dict["start"] + if "end" in whisper_dict: + transcribed_data.end = whisper_dict["end"] + return transcribed_data diff --git a/src/modules/Speech_Recognition/TranscriptionResult.py b/src/modules/Speech_Recognition/TranscriptionResult.py new file mode 100644 index 0000000..1fa055f --- /dev/null +++ b/src/modules/Speech_Recognition/TranscriptionResult.py @@ -0,0 +1,14 @@ +from dataclasses import dataclass + +from dataclasses_json import dataclass_json + +from modules.Speech_Recognition.TranscribedData import TranscribedData + + +@dataclass_json +@dataclass +class TranscriptionResult: + """Transcription result""" + + transcribed_data: list[TranscribedData] + detected_language: str diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py index abcbb96..9cb228d 100644 --- a/src/modules/Speech_Recognition/Whisper.py +++ b/src/modules/Speech_Recognition/Whisper.py @@ -2,11 +2,16 @@ import sys +import torch import whisperx from torch.cuda import OutOfMemoryError +from modules.Speech_Recognition.TranscriptionResult import TranscriptionResult from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted, red_highlighted -from modules.Speech_Recognition.TranscribedData import TranscribedData +from modules.Speech_Recognition.TranscribedData import TranscribedData, from_whisper + + +MEMORY_ERROR_MESSAGE = f"{ULTRASINGER_HEAD} {blue_highlighted('whisper')} ran out of GPU memory; reduce --whisper_batch_size or force CPU usage with --force_cpu" def transcribe_with_whisper( @@ -17,7 +22,7 @@ def transcribe_with_whisper( batch_size: int = 16, compute_type: str = None, language: str = None, -) -> (list[TranscribedData], str): +) -> TranscriptionResult: """Transcribe with whisper""" # Info: Regardless of the audio sampling rate used in the original audio file, whisper resamples the audio signal to 16kHz (via ffmpeg). So standard input sample rates (44.1 or 48 kHz) should work. @@ -32,27 +37,36 @@ compute_type = "float16" if device == "cuda" else "int8" try: + torch.cuda.empty_cache() + asr_options = { + "max_new_tokens": None, + "clip_timestamps": None, + "hallucination_silence_threshold": None + } + loaded_whisper_model = whisperx.load_model( - model, language=language, device=device, compute_type=compute_type + model, asr_options=asr_options, language=language, device=device, compute_type=compute_type ) except ValueError as value_error: if ( - "Requested float16 compute type, but the target device or backend do not support efficient float16 computation." - in str(value_error.args[0]) + "Requested float16 compute type, but the target device or backend do not support efficient float16 computation."
+ in str(value_error.args[0]) ): print(value_error) print( f"{ULTRASINGER_HEAD} Your GPU does not support efficient float16 computation; run UltraSinger with '--whisper_compute_type int8'" ) - sys.exit(1) raise value_error except OutOfMemoryError as oom_exception: print(oom_exception) - print( - f"{ULTRASINGER_HEAD} {blue_highlighted('whisper')} ran out of GPU memory; reduce --whisper_batch_size or force usage of cpu with --force_cpu" - ) - sys.exit(1) + print(MEMORY_ERROR_MESSAGE) + raise oom_exception + except Exception as exception: + if "CUDA failed with error out of memory" in str(exception.args[0]): + print(exception) + print(MEMORY_ERROR_MESSAGE) + raise exception audio = whisperx.load_audio(audio_path) @@ -78,7 +92,7 @@ f"{ULTRASINGER_HEAD} {red_highlighted('Error:')} Unknown language. " f"Try adding it with --align_model [huggingface]." ) - sys.exit(1) + raise ve # align whisper output result_aligned = whisperx.align( @@ -92,20 +106,18 @@ transcribed_data = convert_to_transcribed_data(result_aligned) - return transcribed_data, detected_language + return TranscriptionResult(transcribed_data, detected_language) + def convert_to_transcribed_data(result_aligned): transcribed_data = [] for segment in result_aligned["segments"]: for obj in segment["words"]: - vtd = TranscribedData(obj) # create custom Word object + vtd = from_whisper(obj) # create custom Word object vtd.word = vtd.word + " " # add space to end of word if len(obj) < 4: - previous = transcribed_data[-1] - if not previous: - previous.end = 0 - previous.end = "" + previous = transcribed_data[-1] if len(transcribed_data) != 0 else TranscribedData() vtd.start = previous.end + 0.1 vtd.end = previous.end + 0.2 msg = f'Error: There is no timestamp for word: "{obj["word"]}".
' \ diff --git a/src/modules/Ultrastar/ultrastar_converter.py b/src/modules/Ultrastar/ultrastar_converter.py index d9978eb..795c20c 100644 --- a/src/modules/Ultrastar/ultrastar_converter.py +++ b/src/modules/Ultrastar/ultrastar_converter.py @@ -1,7 +1,13 @@ """Ultrastar Converter""" +from typing import Tuple + +import numpy from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue +NO_PITCH = -1000 + + def real_bpm_to_ultrastar_bpm(real_bpm: float) -> float: """Converts real BPM to UltraStar BPM""" # The UltraStar BPM info is a fourth beat of the real BPM @@ -48,32 +54,135 @@ def ultrastar_note_to_midi_note(ultrastar_note: int) -> int: return midi_note -def get_start_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) -> float: +def get_start_time_from_ultrastar( + ultrastar_class: UltrastarTxtValue, pos: int +) -> float: """Calculates the start time from the Ultrastar txt""" - gap = int(ultrastar_class.gap) / 1000 - real_bpm = ultrastar_bpm_to_real_bpm( - float(ultrastar_class.bpm.replace(",", ".")) - ) - start_time = ( - beat_to_second(int(ultrastar_class.startBeat[pos]), real_bpm) + gap - ) + gap = float(ultrastar_class.gap.replace(",", ".")) / 1000 + real_bpm = ultrastar_bpm_to_real_bpm(float(ultrastar_class.bpm.replace(",", "."))) + start_time = beat_to_second(int(ultrastar_class.startBeat[pos]), real_bpm) + gap return start_time def get_end_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) -> float: """Calculates the end time from the Ultrastar txt""" - gap = int(ultrastar_class.gap) / 1000 - real_bpm = ultrastar_bpm_to_real_bpm( - float(ultrastar_class.bpm.replace(",", ".")) - ) + gap = float(ultrastar_class.gap.replace(",", ".")) / 1000 + real_bpm = ultrastar_bpm_to_real_bpm(float(ultrastar_class.bpm.replace(",", "."))) end_time = ( beat_to_second( - int(ultrastar_class.startBeat[pos]) - + int(ultrastar_class.durations[pos]), + int(ultrastar_class.startBeat[pos]) + int(ultrastar_class.durations[pos]), real_bpm, ) + gap ) return end_time + + +def map_to_datapoints( + ultrastar_class: UltrastarTxtValue, step_size: int = 10 +) -> list[int]: + gap = float(ultrastar_class.gap.replace(",", ".")) + + data = [] + + previous_step = -step_size + for pos, pitch in enumerate(ultrastar_class.pitches): + if ultrastar_class.noteType[pos] == "F": + continue + + start_time = int(get_start_time_from_ultrastar(ultrastar_class, pos) * 1000 + gap) + end_time = int(get_end_time_from_ultrastar(ultrastar_class, pos) * 1000 + gap) + + start_nearest_step = (start_time + step_size - 1) // step_size * step_size + end_nearest_step = (end_time + step_size - 1) // step_size * step_size + + if previous_step == start_nearest_step: + start_nearest_step += step_size + + duration = end_nearest_step - start_nearest_step + + if duration < 10: + continue + + # pad gaps between pitches with empty datapoints + gap_steps_count = (start_nearest_step - previous_step - step_size) // step_size + data += [NO_PITCH] * gap_steps_count + + pitch_steps_count = duration // step_size + data += [int(pitch)] * pitch_steps_count + previous_step = end_nearest_step + + return data + + +def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[float, float, dict[int, float], dict[int, float], float, float]: + step_size = 10 + + input_datapoints = map_to_datapoints(input_ultrastar_class, step_size) + output_datapoints = map_to_datapoints(output_ultrastar_class, step_size) + + longest = max(len(input_datapoints), len(output_datapoints)) + for datapoints in [input_datapoints, 
output_datapoints]: + length = len(datapoints) + if length < longest: + gap_steps_count = longest - length + # pad gaps between pitches with empty datapoints + datapoints += [NO_PITCH] * gap_steps_count + + input_pitched_datapoints = len([x for x in input_datapoints if x != NO_PITCH]) + output_pitched_datapoints = len([x for x in output_datapoints if x != NO_PITCH]) + + matches = 0 + pitch_shift_matches = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + pitch_where_should_be_no_pitch = 0 + no_pitch_where_should_be_pitch = 0 + for index, _ in enumerate(input_datapoints): + input_pitch = input_datapoints[index] + output_pitch = output_datapoints[index] + if input_pitch == NO_PITCH and output_pitch == NO_PITCH: + continue + + if input_pitch == output_pitch: + matches += 1 + elif input_pitch == NO_PITCH: + pitch_where_should_be_no_pitch += 1 + elif output_pitch == NO_PITCH: + no_pitch_where_should_be_pitch += 1 + else: + _, input_pitch_remainder = divmod(input_pitch, 12) + _, output_pitch_remainder = divmod(output_pitch, 12) + pitch_difference = abs(input_pitch_remainder - output_pitch_remainder) + pitch_shift_matches[pitch_difference] += 1 + + input_match_ratio = matches / input_pitched_datapoints + output_match_ratio = matches / output_pitched_datapoints + + input_pitch_shift_match_ratios = {} + output_pitch_shift_match_ratios = {} + for index, pitch_shift_matches_item in enumerate(pitch_shift_matches): + pitch_shift_matches_count = pitch_shift_matches_item + if index == 0: + pitch_shift_matches_count += matches + input_pitch_shift_match_ratios[index] = pitch_shift_matches_count / input_pitched_datapoints + output_pitch_shift_match_ratios[index] = pitch_shift_matches_count / output_pitched_datapoints + + output_pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch / output_pitched_datapoints + output_no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch / input_pitched_datapoints + + return (input_match_ratio, + output_match_ratio, + input_pitch_shift_match_ratios, + output_pitch_shift_match_ratios, + output_pitch_where_should_be_no_pitch_ratio, + output_no_pitch_where_should_be_pitch_ratio + ) + + +def determine_nearest_end_step(input_ultrastar_class, step_size) -> int: + pitches_count = len(input_ultrastar_class.pitches) - 1 + end_time = int( + get_end_time_from_ultrastar(input_ultrastar_class, pitches_count) * 1000 + ) + int(input_ultrastar_class.gap) + return (end_time + step_size - 1) // step_size * step_size diff --git a/src/modules/Ultrastar/ultrastar_parser.py b/src/modules/Ultrastar/ultrastar_parser.py index f3e6100..ba78cec 100644 --- a/src/modules/Ultrastar/ultrastar_parser.py +++ b/src/modules/Ultrastar/ultrastar_parser.py @@ -5,7 +5,15 @@ get_end_time_from_ultrastar, get_start_time_from_ultrastar, ) -from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue, UltrastarTxtTag, UltrastarTxtNoteTypeTag, FILE_ENCODING +from modules.Ultrastar.ultrastar_txt import ( + UltrastarTxtValue, + UltrastarTxtTag, + UltrastarTxtNoteTypeTag, + FILE_ENCODING, +) + +CHARACTERS_TO_REMOVE = ["\ufeff"] + def parse_ultrastar_txt(input_file: str) -> UltrastarTxtValue: """Parse ultrastar txt file to UltrastarTxt class""" with open(input_file, "r", encoding=FILE_ENCODING) as file: txt = file.readlines() - ultrastar_class = UltrastarTxtValue() count = 0 # Strips the newline character for line in txt: + filtered_line = line + for character_to_remove in CHARACTERS_TO_REMOVE: + filtered_line =
filtered_line.replace(character_to_remove, "") count += 1 - if line.startswith("#"): - if line.startswith(f"#{UltrastarTxtTag.ARTIST}"): - ultrastar_class.artist = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.TITLE}"): - ultrastar_class.title = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.MP3}"): - ultrastar_class.mp3 = line.split(":")[1].replace("\n", "") + if filtered_line.startswith("#"): + if filtered_line.startswith(f"#{UltrastarTxtTag.ARTIST}"): + ultrastar_class.artist = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.TITLE}"): + ultrastar_class.title = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.MP3}"): + ultrastar_class.mp3 = filtered_line.split(":")[1].replace("\n", "") elif line.startswith(f"#{UltrastarTxtTag.AUDIO}"): ultrastar_class.audio = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.VIDEO}"): - ultrastar_class.video = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.GAP}"): - ultrastar_class.gap = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.BPM}"): - ultrastar_class.bpm = line.split(":")[1].replace("\n", "") - elif line.startswith(( + elif filtered_line.startswith(f"#{UltrastarTxtTag.GAP}"): + ultrastar_class.gap = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.BPM}"): + ultrastar_class.bpm = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.VIDEO}"): + ultrastar_class.video = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.VIDEOGAP}"): + ultrastar_class.videoGap = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.COVER}"): + ultrastar_class.cover = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.BACKGROUND}"): + ultrastar_class.background = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith( + ( f"{UltrastarTxtNoteTypeTag.FREESTYLE} ", f"{UltrastarTxtNoteTypeTag.NORMAL} ", f"{UltrastarTxtNoteTypeTag.GOLDEN} ", f"{UltrastarTxtNoteTypeTag.RAP} ", - f"{UltrastarTxtNoteTypeTag.RAP_GOLDEN} ")): - parts = line.split() + f"{UltrastarTxtNoteTypeTag.RAP_GOLDEN} ", + ) + ): + parts = filtered_line.split() # [0] F : * R G # [1] start beat # [2] duration diff --git a/src/modules/Ultrastar/ultrastar_score_calculator.py b/src/modules/Ultrastar/ultrastar_score_calculator.py index e0deaf5..9536598 100644 --- a/src/modules/Ultrastar/ultrastar_score_calculator.py +++ b/src/modules/Ultrastar/ultrastar_score_calculator.py @@ -1,4 +1,7 @@ """Ultrastar score calculator.""" +from dataclasses import dataclass + +from dataclasses_json import dataclass_json import librosa @@ -48,6 +51,8 @@ def add_point(note_type: str, points: Points) -> Points: return points +@dataclass_json +@dataclass class Score: """Docstring""" diff --git a/src/modules/Ultrastar/ultrastar_txt.py b/src/modules/Ultrastar/ultrastar_txt.py index 14f7cda..39d4054 100644 --- a/src/modules/Ultrastar/ultrastar_txt.py +++ b/src/modules/Ultrastar/ultrastar_txt.py @@ -19,9 +19,11 @@ class UltrastarTxtTag(str, Enum): GENRE = 'GENRE' # Multi-language support since v1.1.0 YEAR = 'YEAR' # Multi-language support since v1.1.0 COVER = 'COVER' # Path to cover. 
Should end with `*[CO].jpg` + BACKGROUND = 'BACKGROUND' # Path to background. Is shown when there is no video. Should end with `*[BG].jpg` CREATOR = 'CREATOR' # Multi-language support since v1.1.0 COMMENT = 'COMMENT' VIDEO = 'VIDEO' + VIDEOGAP = 'VIDEOGAP' FILE_END = 'E' LINEBREAK = '-' @@ -32,8 +34,6 @@ class UltrastarTxtTag(str, Enum): TAGS = 'TAGS' # Tags for the song. Can be used for filtering # Unused 0.2.0 - BACKGROUND = 'BACKGROUND' # Path to background. Is shown when there is no video. Should end with `*[BG].jpg` - VIDEOGAP = 'VIDEOGAP' EDITION = 'EDITION' # Multi-language support since v1.1.0 START = 'START' END = 'END' @@ -84,13 +84,15 @@ class UltrastarTxtValue: mp3 = "" audio = "" video = None - gap = "" + videoGap = None + gap = "0" bpm = "" language = None cover = None vocals = None instrumental = None tags = None + background = None creator = "UltraSinger [GitHub]" comment = "UltraSinger [GitHub]" startBeat = [] diff --git a/src/modules/Ultrastar/ultrastar_writer.py b/src/modules/Ultrastar/ultrastar_writer.py index e04e5a9..a6edb4b 100644 --- a/src/modules/Ultrastar/ultrastar_writer.py +++ b/src/modules/Ultrastar/ultrastar_writer.py @@ -99,6 +99,8 @@ def create_ultrastar_txt_from_automation( file.write(f"#{UltrastarTxtTag.GENRE}:{ultrastar_class.genre}\n") if ultrastar_class.cover is not None: file.write(f"#{UltrastarTxtTag.COVER}:{ultrastar_class.cover}\n") + if ultrastar_class.background is not None: + file.write(f"#{UltrastarTxtTag.BACKGROUND}:{ultrastar_class.background}\n") file.write(f"#{UltrastarTxtTag.MP3}:{ultrastar_class.mp3}\n") if version.parse(ultrastar_class.version) >= version.parse("1.1.0"): file.write(f"#{UltrastarTxtTag.AUDIO}:{ultrastar_class.audio}\n") @@ -108,7 +110,10 @@ def create_ultrastar_txt_from_automation( file.write(f"#{UltrastarTxtTag.INSTRUMENTAL}:{ultrastar_class.instrumental}\n") if ultrastar_class.tags is not None: file.write(f"#{UltrastarTxtTag.TAGS}:{ultrastar_class.tags}\n") - file.write(f"#{UltrastarTxtTag.VIDEO}:{ultrastar_class.video}\n") + if ultrastar_class.video is not None: + file.write(f"#{UltrastarTxtTag.VIDEO}:{ultrastar_class.video}\n") + if ultrastar_class.videoGap is not None: + file.write(f"#{UltrastarTxtTag.VIDEOGAP}:{ultrastar_class.videoGap}\n") file.write(f"#{UltrastarTxtTag.BPM}:{round(ultrastar_bpm, 2)}\n") # not the real BPM! 
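The GAP tag written on the next line stores seconds as integer milliseconds; the parser changes earlier in this diff read it back, accepting either a comma or a dot decimal. A round-trip sketch with illustrative helper names:

def write_gap_tag(gap_seconds: float) -> str:
    # mirrors file.write(f"#{UltrastarTxtTag.GAP}:{int(gap * 1000)}\n") below
    return f"#GAP:{int(gap_seconds * 1000)}"


def parse_gap_value(value: str) -> float:
    # mirrors the parser side: float(gap.replace(",", ".")) / 1000
    return float(value.replace(",", ".")) / 1000


assert write_gap_tag(1.5) == "#GAP:1500"
assert parse_gap_value("1500") == 1.5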
file.write(f"#{UltrastarTxtTag.GAP}:{int(gap * 1000)}\n") file.write(f"#{UltrastarTxtTag.CREATOR}:{ultrastar_class.creator}\n") diff --git a/src/modules/console_colors.py b/src/modules/console_colors.py index c4cc6f8..e5d9375 100644 --- a/src/modules/console_colors.py +++ b/src/modules/console_colors.py @@ -8,6 +8,11 @@ def blue_highlighted(text: str) -> str: return f"{Bcolors.blue}{text}{Bcolors.endc}" +def green_highlighted(text: str) -> str: + """Returns a green highlighted text""" + return f"{Bcolors.dark_green}{text}{Bcolors.endc}" + + def gold_highlighted(text: str) -> str: """Returns a gold highlighted text""" return f"{Bcolors.gold}{text}{Bcolors.endc}" @@ -37,6 +42,7 @@ class Bcolors: """Colors for the console""" blue = "\033[94m" + dark_green = "\033[32m" red = "\033[91m" light_blue = "\033[96m" cyan = "\033[36m" diff --git a/src/modules/csv_handler.py b/src/modules/csv_handler.py index 58cceeb..3bb01f4 100644 --- a/src/modules/csv_handler.py +++ b/src/modules/csv_handler.py @@ -20,7 +20,7 @@ def export_transcribed_data_to_csv(transcribed_data: list[TranscribedData], file data.word, data.start, data.end, - data.conf, + data.confidence, ] ) diff --git a/src/modules/plot.py b/src/modules/plot.py index 49f63fa..532c313 100644 --- a/src/modules/plot.py +++ b/src/modules/plot.py @@ -176,7 +176,7 @@ def determine_bounds(frequency_log_10: list[float]) -> tuple[float, float]: def set_figure_dimensions(time_range, frequency_log_10_range): """Dynamically scale the figure dimensions based on the duration/frequency amplitude of the song""" height = frequency_log_10_range / 0.06 - width = time_range / 2 + width = time_range / 4 plt.figure(1).set_figwidth(max(6.4, width)) plt.figure(1).set_figheight(max(4, height)) @@ -188,7 +188,7 @@ def create_gaps(pitched_data: PitchedData, step_size: float) -> PitchedData: This way the graph is only continuous where it should be. """ - pitched_data_with_gaps = PitchedData([], [], []) + pitched_data_with_gaps = PitchedData([], [], [], []) previous_time = 0 for i, time in enumerate(pitched_data.times):