Merge pull request #1248 from oxygen-dioxide/diffsinger

Support Diffsinger multi-dictionary
stakira · Sep 1, 2024 · 9d574ef · 9d574ef
2 parents f2fbda0 + d49a9af
commit 9d574ef
Show file tree

Hide file tree

Showing 18 changed files with 216 additions and 31 deletions.
diff --git a/OpenUtau.Core/DiffSinger/DiffSingerBasePhonemizer.cs b/OpenUtau.Core/DiffSinger/DiffSingerBasePhonemizer.cs
@@ -17,18 +17,20 @@ public abstract class DiffSingerBasePhonemizer : MachineLearningPhonemizer
     {
         USinger singer;
         DsConfig dsConfig;
+        Dictionary<string, int>languageIds = new Dictionary<string, int>();
         string rootPath;
         float frameMs;
         ulong linguisticHash;
         ulong durationHash;
         InferenceSession linguisticModel;
         InferenceSession durationModel;
         IG2p g2p;
-        List<string> phonemes;
+        Dictionary<string, int> phonemeTokens;
         DiffSingerSpeakerEmbedManager speakerEmbedManager;
 
         string defaultPause = "SP";
         protected virtual string GetDictionaryName()=>"dsdict.yaml";
+        protected virtual string GetLangCode()=>String.Empty;//The language code of the language the phonemizer is made for
 
         public override void SetSinger(USinger singer) {
             this.singer = singer;
@@ -53,12 +55,26 @@ public override void SetSinger(USinger singer) {
                 Log.Error(e, $"failed to load dsconfig from {configPath}");
                 return;
             }
+            //Load language id if needed
+            if(dsConfig.use_lang_id){
+                if(dsConfig.languages == null){
+                    Log.Error("\"languages\" field is not specified in dsconfig.yaml");
+                    return;
+                }
+                var langIdPath = Path.Join(rootPath, dsConfig.languages);
+                try {
+                    languageIds = DiffSingerUtils.LoadLanguageIds(langIdPath);
+                } catch (Exception e) {
+                    Log.Error(e, $"failed to load language id from {langIdPath}");
+                    return;
+                }
+            }
             this.frameMs = dsConfig.frameMs();
             //Load g2p
             g2p = LoadG2p(rootPath);
             //Load phonemes list
             string phonemesPath = Path.Combine(rootPath, dsConfig.phonemes);
-            phonemes = File.ReadLines(phonemesPath,singer.TextFileEncoding).ToList();
+            phonemeTokens = DiffSingerUtils.LoadPhonemes(phonemesPath);
             //Load models
             var linguisticModelPath = Path.Join(rootPath, dsConfig.linguistic);
             try {
@@ -106,6 +122,29 @@ protected virtual IG2p LoadG2p(string rootPath) {
             return new G2pFallbacks(g2ps.ToArray());
         }
 
+        //Resolve a phoneme to a symbol accepted by the g2p.
+        //The phoneme is returned unchanged when the g2p accepts it as written.
+        //Otherwise, if GetLangCode() is not an empty string, "<langCode>/<phoneme>" is tried.
+        //Returns an empty string when neither form is accepted.
+        string ValidatePhoneme(string phoneme){
+            if(g2p.IsValidSymbol(phoneme)){
+                return phoneme;
+            }
+            var langCode = GetLangCode();
+            if(langCode == String.Empty){
+                //No language prefix to fall back on
+                return String.Empty;
+            }
+            var prefixed = $"{langCode}/{phoneme}";
+            return g2p.IsValidSymbol(prefixed) ? prefixed : String.Empty;
+        }
+
+        //Split a whitespace-separated phonetic hint into symbols,
+        //pass each one through ValidatePhoneme and drop those that come back empty.
+        string[] ParsePhoneticHint(string phoneticHint) {
+            var symbols = new List<string>();
+            foreach(var token in phoneticHint.Split()){
+                var symbol = ValidatePhoneme(token);
+                if(String.IsNullOrEmpty(symbol)){
+                    continue; // skip invalid symbols.
+                }
+                symbols.Add(symbol);
+            }
+            return symbols.ToArray();
+        }
+
         string[] GetSymbols(Note note) {
             //priority:
             //1. phonetic hint
@@ -114,20 +153,16 @@ string[] GetSymbols(Note note) {
             //4. empty
             if (!string.IsNullOrEmpty(note.phoneticHint)) {
                 // Split space-separated symbols into an array.
-                return note.phoneticHint.Split()
-                    .Where(s => g2p.IsValidSymbol(s)) // skip the invalid symbols.
-                    .ToArray();
+                return ParsePhoneticHint(note.phoneticHint);
             }
             // User has not provided hint, query g2p dictionary.
             var g2presult = g2p.Query(note.lyric)
                 ?? g2p.Query(note.lyric.ToLowerInvariant());
             if(g2presult != null) {
                 return g2presult;
             }
-            //not founded in g2p dictionary, treat lyric as phonetic hint
-            var lyricSplited = note.lyric.Split()
-                    .Where(s => g2p.IsValidSymbol(s)) // skip the invalid symbols.
-                    .ToArray();
+            //not found in g2p dictionary, treat lyric as phonetic hint
+            var lyricSplited = ParsePhoneticHint(note.lyric);
             if (lyricSplited.Length > 0) {
                 return lyricSplited;
             }
@@ -168,6 +203,7 @@ List<phonemesPerNote> ProcessWord(Note[] notes, string[] symbols){
             }
             for(int i=0; i<dsPhonemes.Length; i++){
                 if(isVowel[i]){
+                    //In "Consonant-Glide-Vowel" syllable, the glide phoneme is the first phoneme in the note's timespan.
                     if(i>=2 && isGlide[i-1] && !isVowel[i-2]){
                         isStart[i-1] = true;
                     }else{
@@ -228,11 +264,11 @@ public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
         }
 
         int PhonemeTokenize(string phoneme){
-            int result = phonemes.IndexOf(phoneme);
-            if(result < 0){
+            //Map a phoneme symbol to its token id from the phonemes file.
+            //phonemeTokens is still null when SetSinger returned before loading the phoneme table;
+            //treat that like an unknown phoneme so the caller gets the descriptive error below
+            //instead of a NullReferenceException.
+            if(phonemeTokens == null || !phonemeTokens.TryGetValue(phoneme, out int token)){
                 throw new Exception($"Phoneme \"{phoneme}\" isn't supported by timing model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
             }
-            return result;
+            return token;
         }
 
         protected override void ProcessPart(Note[][] phrase) {
@@ -290,6 +326,16 @@ protected override void ProcessPart(Note[][] phrase) {
             linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_dur",
                 new DenseTensor<Int64>(word_dur, new int[] { word_dur.Length }, false)
                 .Reshape(new int[] { 1, word_dur.Length })));
+            //Language id
+            if(dsConfig.use_lang_id){
+                var langIdByPhone = phrasePhonemes
+                    .SelectMany(n => n.Phonemes)
+                    .Select(p => (long)languageIds.GetValueOrDefault(p.Language(), 0))
+                    .ToArray();
+                var langIdTensor = new DenseTensor<Int64>(langIdByPhone, new int[] { langIdByPhone.Length }, false)
+                    .Reshape(new int[] { 1, langIdByPhone.Length });
+                linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("languages", langIdTensor));
+            }
             Onnx.VerifyInputNames(linguisticModel, linguisticInputs);
             var linguisticCache = Preferences.Default.DiffSingerTensorCache
                 ? new DiffSingerCache(linguisticHash, linguisticInputs)
@@ -393,6 +439,10 @@ public dsPhoneme(string symbol, string speaker){
             Symbol = symbol;
             Speaker = speaker;
         }
+
+        //Language of this phoneme, obtained by passing Symbol to DiffSingerUtils.PhonemeLanguage.
+        public string Language() => DiffSingerUtils.PhonemeLanguage(Symbol);
     }
 
     class phonemesPerNote{

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs b/OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
@@ -16,6 +16,7 @@ public class AugmentationArgs {
     [Serializable]
     public class DsConfig {
         public string phonemes = "phonemes.txt";
+        public string languages;
         public string acoustic;
         public string vocoder;
         public List<string> speakers;
@@ -28,6 +29,7 @@ public class DsConfig {
         public bool useTensionEmbed = false;
         public AugmentationArgs augmentationArgs;
         public bool useContinuousAcceleration = false;
+        public bool use_lang_id = false;
         [YamlMember(Alias = "use_shallow_diffusion")] public bool? _useShallowDiffusion;
         [YamlMember(Alias = "use_variable_depth")] public bool? _useVariableDepth;
         [YamlIgnore]

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs b/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs
@@ -2,8 +2,6 @@
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
-using System.Runtime.CompilerServices;
-using System.Text;
 using K4os.Hash.xxHash;
 using Microsoft.ML.OnnxRuntime;
 using Microsoft.ML.OnnxRuntime.Tensors;
@@ -19,7 +17,8 @@ public class DsPitch : IDisposable
     {
         string rootPath;
         DsConfig dsConfig;
-        List<string> phonemes;
+        Dictionary<string, int> languageIds = new Dictionary<string, int>();
+        Dictionary<string, int> phonemeTokens;
         ulong linguisticHash;
         InferenceSession linguisticModel;
         InferenceSession pitchModel;
@@ -39,9 +38,23 @@ public DsPitch(string rootPath)
             if(dsConfig.pitch == null){
                 throw new Exception("This voicebank doesn't contain a pitch model");
             }
+            //Load language id if needed
+            if(dsConfig.use_lang_id){
+                if(dsConfig.languages == null){
+                    Log.Error("\"languages\" field is not specified in dsconfig.yaml");
+                    return;
+                }
+                var langIdPath = Path.Join(rootPath, dsConfig.languages);
+                try {
+                    languageIds = DiffSingerUtils.LoadLanguageIds(langIdPath);
+                } catch (Exception e) {
+                    Log.Error(e, $"failed to load language id from {langIdPath}");
+                    return;
+                }
+            }
             //Load phonemes list
             string phonemesPath = Path.Combine(rootPath, dsConfig.phonemes);
-            phonemes = File.ReadLines(phonemesPath, Encoding.UTF8).ToList();
+            phonemeTokens = DiffSingerUtils.LoadPhonemes(phonemesPath);
             //Load models
             var linguisticModelPath = Path.Join(rootPath, dsConfig.linguistic);
             var linguisticModelBytes = File.ReadAllBytes(linguisticModelPath);
@@ -81,11 +94,11 @@ void SetRange<T>(T[] list, T value, int startIndex, int endIndex){
         }
 
         int PhonemeTokenize(string phoneme){
-            int result = phonemes.IndexOf(phoneme);
-            if(result < 0){
+            //Map a phoneme symbol to its token id from the phonemes file.
+            //phonemeTokens is still null when the constructor returned before loading the phoneme table;
+            //treat that like an unknown phoneme so the caller gets the descriptive error below
+            //instead of a NullReferenceException.
+            if(phonemeTokens == null || !phonemeTokens.TryGetValue(phoneme, out int token)){
                 throw new Exception($"Phoneme \"{phoneme}\" isn't supported by pitch model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
             }
-            return result;
+            return token;
         }
 
         public RenderPitchResult Process(RenderPhrase phrase){
@@ -133,12 +146,25 @@ public RenderPitchResult Process(RenderPhrase phrase){
                 linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_dur",
                     new DenseTensor<Int64>(word_dur, new int[] { word_dur.Length }, false)
                     .Reshape(new int[] { 1, word_dur.Length })));
-            }else{
+            } else {
                 //if predict_dur is false, use phoneme encode mode
                 linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("ph_dur",
                     new DenseTensor<Int64>(ph_dur.Select(x=>(Int64)x).ToArray(), new int[] { ph_dur.Length }, false)
                     .Reshape(new int[] { 1, ph_dur.Length })));
             }
+            //Language id
+            if(dsConfig.use_lang_id){
+                var langIdByPhone = phrase.phones
+                    .Select(p => (long)languageIds.GetValueOrDefault(
+                        DiffSingerUtils.PhonemeLanguage(p.phoneme),0
+                        ))
+                    .Prepend(0)
+                    .Append(0)
+                    .ToArray();
+                var langIdTensor = new DenseTensor<Int64>(langIdByPhone, new int[] { langIdByPhone.Length }, false)
+                    .Reshape(new int[] { 1, langIdByPhone.Length });
+                linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("languages", langIdTensor));
+            }
 
             Onnx.VerifyInputNames(linguisticModel, linguisticInputs);
             var linguisticCache = Preferences.Default.DiffSingerTensorCache

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
@@ -258,7 +258,19 @@ float[] InvokeDiffsinger(RenderPhrase phrase, double depth, int steps, Cancellat
                 acousticInputs.Add(NamedOnnxValue.CreateFromTensor("speedup",
                     new DenseTensor<long>(new long[] { speedup }, new int[] { 1 }, false)));
             }
-
+            //Language id
+            if(singer.dsConfig.use_lang_id){
+                var langIdByPhone = phrase.phones
+                    .Select(p => (long)singer.languageIds.GetValueOrDefault(
+                        DiffSingerUtils.PhonemeLanguage(p.phoneme),0
+                        ))
+                    .Prepend(0)
+                    .Append(0)
+                    .ToArray();
+                var langIdTensor = new DenseTensor<Int64>(langIdByPhone, new int[] { langIdByPhone.Length }, false)
+                    .Reshape(new int[] { 1, langIdByPhone.Length });
+                acousticInputs.Add(NamedOnnxValue.CreateFromTensor("languages", langIdTensor));
+            }
             //speaker
             if(singer.dsConfig.speakers != null) {
                 var speakerEmbedManager = singer.getSpeakerEmbedManager();

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerSinger.cs b/OpenUtau.Core/DiffSinger/DiffSingerSinger.cs
@@ -41,8 +41,9 @@ class DiffSingerSinger : USinger {
         List<USubbank> subbanks = new List<USubbank>();
         List<UOto> otos = new List<UOto>();
         Dictionary<string, UOto> otoMap = new Dictionary<string, UOto>();
-
         public List<string> phonemes = new List<string>();
+        Dictionary<string, int> phonemeTokens;
+        public Dictionary<string, int> languageIds = new Dictionary<string, int>();
         public DsConfig dsConfig;
         public ulong acousticHash;
         public InferenceSession acousticSession = null;
@@ -94,14 +95,29 @@ public DiffSingerSinger(Voicebank voicebank) {
             string phonemesPath = Path.Combine(Location, dsConfig.phonemes);
             if(phonemesPath != null && File.Exists(phonemesPath)){
                 try {
-                    phonemes = File.ReadLines(phonemesPath, TextFileEncoding).ToList();
+                    phonemeTokens = DiffSingerUtils.LoadPhonemes(phonemesPath);
+                    phonemes = phonemeTokens.Keys.ToList();
                 } catch (Exception e){
                     Log.Error(e, $"Failed to load phoneme list for {Name} from {phonemesPath}");
                 }
             } else {
                 Log.Error($"phonemes file not found for {Name} at {phonemesPath}");
             }
 
+            //Load language Id if needed
+            if(dsConfig.use_lang_id){
+                if(dsConfig.languages == null){
+                    Log.Error("\"languages\" field is not specified in dsconfig.yaml");
+                } else {
+                var langIdPath = Path.Join(Location, dsConfig.languages);
+                    try {
+                        languageIds = DiffSingerUtils.LoadLanguageIds(langIdPath);
+                    } catch (Exception e) {
+                        Log.Error(e, $"failed to load language id from {langIdPath}");
+                    }
+                }
+            }
+
             var dummyOtoSet = new UOtoSet(new OtoSet(), Location);
             foreach (var phone in phonemes) {
                 var uOto = UOto.OfDummy(phone);
@@ -194,11 +210,11 @@ public DsVariance getVariancePredictor(){
         }
 
         public int PhonemeTokenize(string phoneme){
-            int result = phonemes.IndexOf(phoneme);
-            if(result < 0){
+            //Map a phoneme symbol to its token id from the phonemes file.
+            //phonemeTokens is still null when the phonemes file was missing or failed to load
+            //(the constructor only logs those failures); treat that like an unknown phoneme so the
+            //caller gets the descriptive error below instead of a NullReferenceException.
+            if(phonemeTokens == null || !phonemeTokens.TryGetValue(phoneme, out int token)){
                 throw new Exception($"Phoneme \"{phoneme}\" isn't supported by acoustic model. Please check {Path.Combine(Location, dsConfig.phonemes)}");
             }
-            return result;
+            return token;
         }
 
         public override void FreeMemory(){

diff --git a/OpenUtau.Core/DiffSinger/DiffSingerUtils.cs b/OpenUtau.Core/DiffSinger/DiffSingerUtils.cs
@@ -1,5 +1,9 @@
 using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
 using Microsoft.ML.OnnxRuntime.Tensors;
+using Newtonsoft.Json;
 using OpenUtau.Core.Render;
 
 namespace OpenUtau.Core.DiffSinger {
@@ -95,5 +99,40 @@ public static string ShapeString<T>(Tensor<T> tensor){
             var shape = tensor.Dimensions;
             return "(" + string.Join(", ", shape.ToArray()) + ")";
         }
+
+        //Load the phoneme-to-token table from filePath.
+        //A file whose extension is ".json" (in any letter case) is parsed as JSON;
+        //every other file is read as a plain text phoneme list.
+        public static Dictionary<string, int> LoadPhonemes(string filePath){
+            //Ordinal comparison keeps format detection independent of the current culture's casing rules
+            var isJson = string.Equals(Path.GetExtension(filePath), ".json", StringComparison.OrdinalIgnoreCase);
+            return isJson
+                ? LoadPhonemesFromJson(filePath)
+                : LoadPhonemesFromTxt(filePath);
+        }
+
+        //Read a JSON object that maps phoneme symbols to token ids.
+        //Throws InvalidDataException when the file yields no object (for example an empty file
+        //or the JSON literal null), so callers never receive a null table.
+        static Dictionary<string, int> LoadPhonemesFromJson(string filePath){
+            var json = File.ReadAllText(filePath, Encoding.UTF8);
+            var tokens = JsonConvert.DeserializeObject<Dictionary<string, int>>(json);
+            if(tokens == null){
+                throw new InvalidDataException($"No phoneme table found in {filePath}");
+            }
+            return tokens;
+        }
+
+        //Read a plain text phoneme list, one phoneme per line.
+        //The token id of a phoneme is the zero-based index of its line.
+        //If a phoneme is listed more than once, its first occurrence defines the id.
+        static Dictionary<string, int> LoadPhonemesFromTxt(string filePath){
+            var lines = File.ReadAllLines(filePath, Encoding.UTF8);
+            var result = new Dictionary<string, int>(lines.Length);
+            for (int i = 0; i < lines.Length; i++) {
+                //TryAdd leaves an existing entry untouched, so a repeated line keeps its first index
+                result.TryAdd(lines[i], i);
+            }
+            return result;
+        }
+
+        //Read a JSON object that maps language names to language ids.
+        //Throws InvalidDataException when the file yields no object (for example an empty file
+        //or the JSON literal null), so callers never receive a null table.
+        public static Dictionary<string, int> LoadLanguageIds(string filePath){
+            var json = File.ReadAllText(filePath, Encoding.UTF8);
+            var languageIds = JsonConvert.DeserializeObject<Dictionary<string, int>>(json);
+            if(languageIds == null){
+                throw new InvalidDataException($"No language id table found in {filePath}");
+            }
+            return languageIds;
+        }
+
+        //Language prefix of a phoneme symbol: the text before the first "/",
+        //or an empty string when the symbol contains no "/".
+        public static string PhonemeLanguage(string phoneme){
+            int slash = phoneme.IndexOf('/');
+            return slash < 0 ? "" : phoneme.Substring(0, slash);
+        }
     }
 }