Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev cosyvoice #242

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions basereal.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@
from threading import Thread, Event
from io import BytesIO
import soundfile as sf

import av
from fractions import Fraction

from ttsreal import EdgeTTS,VoitsTTS,XTTS
from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS


from tqdm import tqdm
def read_imgs(img_list):
Expand All @@ -41,6 +41,10 @@ def __init__(self, opt):
self.tts = VoitsTTS(opt,self)
elif opt.tts == "xtts":
self.tts = XTTS(opt,self)

elif opt.tts == "cosyvoice":
self.tts = CosyVoiceTTS(opt,self)


self.recording = False
self.recordq_video = Queue()
Expand Down
Empty file added cosyvoice/__init__.py
Empty file.
83 changes: 83 additions & 0 deletions cosyvoice/cosyvoice.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
from hyperpyyaml import load_hyperpyyaml
from modelscope import snapshot_download
from .frontend import CosyVoiceFrontEnd
from .model import CosyVoiceModel

class CosyVoice:

def __init__(self, model_dir):
instruct = True if '-Instruct' in model_dir else False
self.model_dir = model_dir
if not os.path.exists(model_dir):
model_dir = snapshot_download(model_dir)
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
configs = load_hyperpyyaml(f)
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
configs['feat_extractor'],
'{}/campplus.onnx'.format(model_dir),
'{}/speech_tokenizer_v1.onnx'.format(model_dir),
'{}/spk2info.pt'.format(model_dir),
instruct,
configs['allowed_special'])
self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
self.model.load('{}/llm.pt'.format(model_dir),
'{}/flow.pt'.format(model_dir),
'{}/hift.pt'.format(model_dir))
del configs

def list_avaliable_spks(self):
spks = list(self.frontend.spk2info.keys())
return spks

def inference_sft(self, tts_text, spk_id):
tts_speeches = []
for i in self.frontend.text_normalize(tts_text, split=True):
model_input = self.frontend.frontend_sft(i, spk_id)
model_output = self.model.inference(**model_input)
tts_speeches.append(model_output['tts_speech'])
return {'tts_speech': torch.concat(tts_speeches, dim=1)}

def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
prompt_text = self.frontend.text_normalize(prompt_text, split=False)
tts_speeches = []
for i in self.frontend.text_normalize(tts_text, split=True):
model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k)
model_output = self.model.inference(**model_input)
tts_speeches.append(model_output['tts_speech'])
return {'tts_speech': torch.concat(tts_speeches, dim=1)}

def inference_cross_lingual(self, tts_text, prompt_speech_16k):
if self.frontend.instruct is True:
raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
tts_speeches = []
for i in self.frontend.text_normalize(tts_text, split=True):
model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k)
model_output = self.model.inference(**model_input)
tts_speeches.append(model_output['tts_speech'])
return {'tts_speech': torch.concat(tts_speeches, dim=1)}

def inference_instruct(self, tts_text, spk_id, instruct_text):
if self.frontend.instruct is False:
raise ValueError('{} do not support instruct inference'.format(self.model_dir))
instruct_text = self.frontend.text_normalize(instruct_text, split=False)
tts_speeches = []
for i in self.frontend.text_normalize(tts_text, split=True):
model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
model_output = self.model.inference(**model_input)
tts_speeches.append(model_output['tts_speech'])
return {'tts_speech': torch.concat(tts_speeches, dim=1)}
Empty file added cosyvoice/dataset/__init__.py
Empty file.
Loading