Skip to content

Commit

Permalink
Merge pull request #1248 from oxygen-dioxide/diffsinger
Browse files Browse the repository at this point in the history
Support Diffsinger multi-dictionary
  • Loading branch information
stakira authored Sep 1, 2024
2 parents f2fbda0 + d49a9af commit 9d574ef
Show file tree
Hide file tree
Showing 18 changed files with 216 additions and 31 deletions.
74 changes: 62 additions & 12 deletions OpenUtau.Core/DiffSinger/DiffSingerBasePhonemizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,20 @@ public abstract class DiffSingerBasePhonemizer : MachineLearningPhonemizer
{
USinger singer;
DsConfig dsConfig;
Dictionary<string, int>languageIds = new Dictionary<string, int>();
string rootPath;
float frameMs;
ulong linguisticHash;
ulong durationHash;
InferenceSession linguisticModel;
InferenceSession durationModel;
IG2p g2p;
List<string> phonemes;
Dictionary<string, int> phonemeTokens;
DiffSingerSpeakerEmbedManager speakerEmbedManager;

string defaultPause = "SP";
/// <summary>
/// Returns the dictionary name. The base implementation returns "dsdict.yaml";
/// subclasses may override it.
/// </summary>
protected virtual string GetDictionaryName() {
    return "dsdict.yaml";
}
/// <summary>
/// The language code of the language the phonemizer is made for.
/// The base implementation returns an empty string; subclasses may override it.
/// </summary>
protected virtual string GetLangCode() {
    return String.Empty;
}

public override void SetSinger(USinger singer) {
this.singer = singer;
Expand All @@ -53,12 +55,26 @@ public override void SetSinger(USinger singer) {
Log.Error(e, $"failed to load dsconfig from {configPath}");
return;
}
//Load language id if needed
if(dsConfig.use_lang_id){
if(dsConfig.languages == null){
Log.Error("\"languages\" field is not specified in dsconfig.yaml");
return;
}
var langIdPath = Path.Join(rootPath, dsConfig.languages);
try {
languageIds = DiffSingerUtils.LoadLanguageIds(langIdPath);
} catch (Exception e) {
Log.Error(e, $"failed to load language id from {langIdPath}");
return;
}
}
this.frameMs = dsConfig.frameMs();
//Load g2p
g2p = LoadG2p(rootPath);
//Load phonemes list
string phonemesPath = Path.Combine(rootPath, dsConfig.phonemes);
phonemes = File.ReadLines(phonemesPath,singer.TextFileEncoding).ToList();
phonemeTokens = DiffSingerUtils.LoadPhonemes(phonemesPath);
//Load models
var linguisticModelPath = Path.Join(rootPath, dsConfig.linguistic);
try {
Expand Down Expand Up @@ -106,6 +122,29 @@ protected virtual IG2p LoadG2p(string rootPath) {
return new G2pFallbacks(g2ps.ToArray());
}

/// <summary>
/// Checks whether a phoneme is accepted by the loaded g2p, trying the bare symbol first
/// and then the symbol prefixed with the phonemizer's language code ("lang/phoneme").
/// </summary>
/// <param name="phoneme">The phoneme symbol to check.</param>
/// <returns>
/// The accepted symbol (possibly with the language prefix applied), or an empty string
/// when neither form is a valid g2p symbol.
/// </returns>
string ValidatePhoneme(string phoneme) {
    // The symbol as given wins over the language-prefixed form.
    if (g2p.IsValidSymbol(phoneme)) {
        return phoneme;
    }
    string prefix = GetLangCode();
    if (prefix == String.Empty) {
        // No language code configured, so there is no second form to try.
        return String.Empty;
    }
    string qualified = prefix + "/" + phoneme;
    return g2p.IsValidSymbol(qualified) ? qualified : String.Empty;
}

/// <summary>
/// Splits a whitespace-separated phonetic hint into symbols and keeps only those
/// that ValidatePhoneme accepts, in their original order.
/// </summary>
/// <param name="phoneticHint">Whitespace-separated phoneme symbols.</param>
/// <returns>The validated symbols; invalid ones are skipped.</returns>
string[] ParsePhoneticHint(string phoneticHint) {
    var accepted = new List<string>();
    foreach (string token in phoneticHint.Split()) {
        string symbol = ValidatePhoneme(token);
        // ValidatePhoneme signals an unsupported symbol with an empty string.
        if (!String.IsNullOrEmpty(symbol)) {
            accepted.Add(symbol);
        }
    }
    return accepted.ToArray();
}

string[] GetSymbols(Note note) {
//priority:
//1. phonetic hint
Expand All @@ -114,20 +153,16 @@ string[] GetSymbols(Note note) {
//4. empty
if (!string.IsNullOrEmpty(note.phoneticHint)) {
// Split space-separated symbols into an array.
return note.phoneticHint.Split()
.Where(s => g2p.IsValidSymbol(s)) // skip the invalid symbols.
.ToArray();
return ParsePhoneticHint(note.phoneticHint);
}
// User has not provided hint, query g2p dictionary.
var g2presult = g2p.Query(note.lyric)
?? g2p.Query(note.lyric.ToLowerInvariant());
if(g2presult != null) {
return g2presult;
}
//not founded in g2p dictionary, treat lyric as phonetic hint
var lyricSplited = note.lyric.Split()
.Where(s => g2p.IsValidSymbol(s)) // skip the invalid symbols.
.ToArray();
//not found in g2p dictionary, treat lyric as phonetic hint
var lyricSplited = ParsePhoneticHint(note.lyric);
if (lyricSplited.Length > 0) {
return lyricSplited;
}
Expand Down Expand Up @@ -168,6 +203,7 @@ List<phonemesPerNote> ProcessWord(Note[] notes, string[] symbols){
}
for(int i=0; i<dsPhonemes.Length; i++){
if(isVowel[i]){
//In "Consonant-Glide-Vowel" syllable, the glide phoneme is the first phoneme in the note's timespan.
if(i>=2 && isGlide[i-1] && !isVowel[i-2]){
isStart[i-1] = true;
}else{
Expand Down Expand Up @@ -228,11 +264,11 @@ public DiffSingerSpeakerEmbedManager getSpeakerEmbedManager(){
}

int PhonemeTokenize(string phoneme){
int result = phonemes.IndexOf(phoneme);
if(result < 0){
bool success = phonemeTokens.TryGetValue(phoneme, out int token);
if(!success){
throw new Exception($"Phoneme \"{phoneme}\" isn't supported by timing model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
}
return result;
return token;
}

protected override void ProcessPart(Note[][] phrase) {
Expand Down Expand Up @@ -290,6 +326,16 @@ protected override void ProcessPart(Note[][] phrase) {
linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_dur",
new DenseTensor<Int64>(word_dur, new int[] { word_dur.Length }, false)
.Reshape(new int[] { 1, word_dur.Length })));
//Language id
if(dsConfig.use_lang_id){
var langIdByPhone = phrasePhonemes
.SelectMany(n => n.Phonemes)
.Select(p => (long)languageIds.GetValueOrDefault(p.Language(), 0))
.ToArray();
var langIdTensor = new DenseTensor<Int64>(langIdByPhone, new int[] { langIdByPhone.Length }, false)
.Reshape(new int[] { 1, langIdByPhone.Length });
linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("languages", langIdTensor));
}
Onnx.VerifyInputNames(linguisticModel, linguisticInputs);
var linguisticCache = Preferences.Default.DiffSingerTensorCache
? new DiffSingerCache(linguisticHash, linguisticInputs)
Expand Down Expand Up @@ -393,6 +439,10 @@ public dsPhoneme(string symbol, string speaker){
Symbol = symbol;
Speaker = speaker;
}

/// <summary>
/// Returns the language of this phoneme, obtained by passing Symbol to
/// DiffSingerUtils.PhonemeLanguage.
/// </summary>
public string Language() => DiffSingerUtils.PhonemeLanguage(Symbol);
}

class phonemesPerNote{
Expand Down
2 changes: 2 additions & 0 deletions OpenUtau.Core/DiffSinger/DiffSingerConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ public class AugmentationArgs {
[Serializable]
public class DsConfig {
public string phonemes = "phonemes.txt";
public string languages;
public string acoustic;
public string vocoder;
public List<string> speakers;
Expand All @@ -28,6 +29,7 @@ public class DsConfig {
public bool useTensionEmbed = false;
public AugmentationArgs augmentationArgs;
public bool useContinuousAcceleration = false;
public bool use_lang_id = false;
[YamlMember(Alias = "use_shallow_diffusion")] public bool? _useShallowDiffusion;
[YamlMember(Alias = "use_variable_depth")] public bool? _useVariableDepth;
[YamlIgnore]
Expand Down
42 changes: 34 additions & 8 deletions OpenUtau.Core/DiffSinger/DiffSingerPitch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;
using K4os.Hash.xxHash;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
Expand All @@ -19,7 +17,8 @@ public class DsPitch : IDisposable
{
string rootPath;
DsConfig dsConfig;
List<string> phonemes;
Dictionary<string, int> languageIds = new Dictionary<string, int>();
Dictionary<string, int> phonemeTokens;
ulong linguisticHash;
InferenceSession linguisticModel;
InferenceSession pitchModel;
Expand All @@ -39,9 +38,23 @@ public DsPitch(string rootPath)
if(dsConfig.pitch == null){
throw new Exception("This voicebank doesn't contain a pitch model");
}
//Load language id if needed
if(dsConfig.use_lang_id){
if(dsConfig.languages == null){
Log.Error("\"languages\" field is not specified in dsconfig.yaml");
return;
}
var langIdPath = Path.Join(rootPath, dsConfig.languages);
try {
languageIds = DiffSingerUtils.LoadLanguageIds(langIdPath);
} catch (Exception e) {
Log.Error(e, $"failed to load language id from {langIdPath}");
return;
}
}
//Load phonemes list
string phonemesPath = Path.Combine(rootPath, dsConfig.phonemes);
phonemes = File.ReadLines(phonemesPath, Encoding.UTF8).ToList();
phonemeTokens = DiffSingerUtils.LoadPhonemes(phonemesPath);
//Load models
var linguisticModelPath = Path.Join(rootPath, dsConfig.linguistic);
var linguisticModelBytes = File.ReadAllBytes(linguisticModelPath);
Expand Down Expand Up @@ -81,11 +94,11 @@ void SetRange<T>(T[] list, T value, int startIndex, int endIndex){
}

int PhonemeTokenize(string phoneme){
int result = phonemes.IndexOf(phoneme);
if(result < 0){
bool success = phonemeTokens.TryGetValue(phoneme, out int token);
if(!success){
throw new Exception($"Phoneme \"{phoneme}\" isn't supported by pitch model. Please check {Path.Combine(rootPath, dsConfig.phonemes)}");
}
return result;
return token;
}

public RenderPitchResult Process(RenderPhrase phrase){
Expand Down Expand Up @@ -133,12 +146,25 @@ public RenderPitchResult Process(RenderPhrase phrase){
linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("word_dur",
new DenseTensor<Int64>(word_dur, new int[] { word_dur.Length }, false)
.Reshape(new int[] { 1, word_dur.Length })));
}else{
} else {
//if predict_dur is false, use phoneme encode mode
linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("ph_dur",
new DenseTensor<Int64>(ph_dur.Select(x=>(Int64)x).ToArray(), new int[] { ph_dur.Length }, false)
.Reshape(new int[] { 1, ph_dur.Length })));
}
//Language id
if(dsConfig.use_lang_id){
var langIdByPhone = phrase.phones
.Select(p => (long)languageIds.GetValueOrDefault(
DiffSingerUtils.PhonemeLanguage(p.phoneme),0
))
.Prepend(0)
.Append(0)
.ToArray();
var langIdTensor = new DenseTensor<Int64>(langIdByPhone, new int[] { langIdByPhone.Length }, false)
.Reshape(new int[] { 1, langIdByPhone.Length });
linguisticInputs.Add(NamedOnnxValue.CreateFromTensor("languages", langIdTensor));
}

Onnx.VerifyInputNames(linguisticModel, linguisticInputs);
var linguisticCache = Preferences.Default.DiffSingerTensorCache
Expand Down
14 changes: 13 additions & 1 deletion OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,19 @@ float[] InvokeDiffsinger(RenderPhrase phrase, double depth, int steps, Cancellat
acousticInputs.Add(NamedOnnxValue.CreateFromTensor("speedup",
new DenseTensor<long>(new long[] { speedup }, new int[] { 1 }, false)));
}

//Language id
if(singer.dsConfig.use_lang_id){
var langIdByPhone = phrase.phones
.Select(p => (long)singer.languageIds.GetValueOrDefault(
DiffSingerUtils.PhonemeLanguage(p.phoneme),0
))
.Prepend(0)
.Append(0)
.ToArray();
var langIdTensor = new DenseTensor<Int64>(langIdByPhone, new int[] { langIdByPhone.Length }, false)
.Reshape(new int[] { 1, langIdByPhone.Length });
acousticInputs.Add(NamedOnnxValue.CreateFromTensor("languages", langIdTensor));
}
//speaker
if(singer.dsConfig.speakers != null) {
var speakerEmbedManager = singer.getSpeakerEmbedManager();
Expand Down
26 changes: 21 additions & 5 deletions OpenUtau.Core/DiffSinger/DiffSingerSinger.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,9 @@ class DiffSingerSinger : USinger {
List<USubbank> subbanks = new List<USubbank>();
List<UOto> otos = new List<UOto>();
Dictionary<string, UOto> otoMap = new Dictionary<string, UOto>();

public List<string> phonemes = new List<string>();
Dictionary<string, int> phonemeTokens;
public Dictionary<string, int> languageIds = new Dictionary<string, int>();
public DsConfig dsConfig;
public ulong acousticHash;
public InferenceSession acousticSession = null;
Expand Down Expand Up @@ -94,14 +95,29 @@ public DiffSingerSinger(Voicebank voicebank) {
string phonemesPath = Path.Combine(Location, dsConfig.phonemes);
if(phonemesPath != null && File.Exists(phonemesPath)){
try {
phonemes = File.ReadLines(phonemesPath, TextFileEncoding).ToList();
phonemeTokens = DiffSingerUtils.LoadPhonemes(phonemesPath);
phonemes = phonemeTokens.Keys.ToList();
} catch (Exception e){
Log.Error(e, $"Failed to load phoneme list for {Name} from {phonemesPath}");
}
} else {
Log.Error($"phonemes file not found for {Name} at {phonemesPath}");
}

//Load language Id if needed
if(dsConfig.use_lang_id){
if(dsConfig.languages == null){
Log.Error("\"languages\" field is not specified in dsconfig.yaml");
} else {
var langIdPath = Path.Join(Location, dsConfig.languages);
try {
languageIds = DiffSingerUtils.LoadLanguageIds(langIdPath);
} catch (Exception e) {
Log.Error(e, $"failed to load language id from {langIdPath}");
}
}
}

var dummyOtoSet = new UOtoSet(new OtoSet(), Location);
foreach (var phone in phonemes) {
var uOto = UOto.OfDummy(phone);
Expand Down Expand Up @@ -194,11 +210,11 @@ public DsVariance getVariancePredictor(){
}

public int PhonemeTokenize(string phoneme){
int result = phonemes.IndexOf(phoneme);
if(result < 0){
bool success = phonemeTokens.TryGetValue(phoneme, out int token);
if(!success){
throw new Exception($"Phoneme \"{phoneme}\" isn't supported by acoustic model. Please check {Path.Combine(Location, dsConfig.phonemes)}");
}
return result;
return token;
}

public override void FreeMemory(){
Expand Down
39 changes: 39 additions & 0 deletions OpenUtau.Core/DiffSinger/DiffSingerUtils.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using Microsoft.ML.OnnxRuntime.Tensors;
using Newtonsoft.Json;
using OpenUtau.Core.Render;

namespace OpenUtau.Core.DiffSinger {
Expand Down Expand Up @@ -95,5 +99,40 @@ public static string ShapeString<T>(Tensor<T> tensor){
var shape = tensor.Dimensions;
return "(" + string.Join(", ", shape.ToArray()) + ")";
}

/// <summary>
/// Loads a phoneme-to-token map from a file, choosing the parser by file extension.
/// </summary>
/// <param name="filePath">Path of the phoneme file.</param>
/// <returns>
/// The result of LoadPhonemesFromJson when the extension is ".json" (case-insensitive),
/// otherwise the result of LoadPhonemesFromTxt.
/// </returns>
public static Dictionary<string, int> LoadPhonemes(string filePath) {
    // Ordinal, case-insensitive comparison: a file extension is an identifier, not
    // linguistic text, so the decision must not depend on the current culture.
    bool isJson = string.Equals(
        Path.GetExtension(filePath), ".json", StringComparison.OrdinalIgnoreCase);
    return isJson ? LoadPhonemesFromJson(filePath) : LoadPhonemesFromTxt(filePath);
}

/// <summary>
/// Reads a UTF-8 JSON file and deserializes it into a phoneme-to-token map.
/// </summary>
/// <param name="filePath">Path of the JSON phoneme file.</param>
/// <returns>The deserialized map; never null.</returns>
/// <exception cref="InvalidDataException">
/// The file deserializes to null (for example it is empty or contains the literal "null").
/// </exception>
static Dictionary<string, int> LoadPhonemesFromJson(string filePath) {
    var json = File.ReadAllText(filePath, Encoding.UTF8);
    var tokens = JsonConvert.DeserializeObject<Dictionary<string, int>>(json);
    // Fail here, with the file name, instead of handing callers a null map that
    // would only blow up later as a NullReferenceException.
    if (tokens == null) {
        throw new InvalidDataException($"Phoneme file {filePath} does not contain a JSON object");
    }
    return tokens;
}

/// <summary>
/// Reads a UTF-8 text file and maps each line to its zero-based line index.
/// </summary>
/// <param name="filePath">Path of the text phoneme file, one symbol per line.</param>
/// <returns>
/// A map from line text to line index. When the same line text occurs more than once,
/// the index of the last occurrence is kept.
/// </returns>
static Dictionary<string, int> LoadPhonemesFromTxt(string filePath) {
    string[] symbols = File.ReadAllLines(filePath, Encoding.UTF8);
    var tokens = new Dictionary<string, int>();
    int index = 0;
    foreach (string symbol in symbols) {
        // Indexer assignment: a repeated symbol overwrites its earlier index.
        tokens[symbol] = index;
        index++;
    }
    return tokens;
}

/// <summary>
/// Reads a UTF-8 JSON file and deserializes it into a language-name-to-id map.
/// </summary>
/// <param name="filePath">Path of the JSON language id file.</param>
/// <returns>The deserialized map; never null.</returns>
/// <exception cref="InvalidDataException">
/// The file deserializes to null (for example it is empty or contains the literal "null").
/// </exception>
public static Dictionary<string, int> LoadLanguageIds(string filePath) {
    var json = File.ReadAllText(filePath, Encoding.UTF8);
    var ids = JsonConvert.DeserializeObject<Dictionary<string, int>>(json);
    // A null map would replace the callers' empty default and crash on the first
    // lookup; report the bad file at load time instead.
    if (ids == null) {
        throw new InvalidDataException($"Language id file {filePath} does not contain a JSON object");
    }
    return ids;
}

/// <summary>
/// Extracts the language prefix of a phoneme written as "language/phoneme".
/// </summary>
/// <param name="phoneme">The phoneme symbol, with or without a language prefix.</param>
/// <returns>
/// The text before the first "/", or an empty string when the symbol contains no "/".
/// </returns>
public static string PhonemeLanguage(string phoneme) {
    int slash = phoneme.IndexOf('/');
    return slash < 0 ? "" : phoneme.Substring(0, slash);
}
}
}
Loading

0 comments on commit 9d574ef

Please sign in to comment.