diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 88cc8e7949..cdb30ea0e0 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -55,6 +55,7 @@ jobs:
       - name: Upload coverage data
         uses: actions/upload-artifact@v4
         with:
+          include-hidden-files: true
           name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }}
           path: .coverage.*
           if-no-files-found: ignore
diff --git a/TTS/.models.json b/TTS/.models.json
index a77ebea1cf..a5add6e34f 100644
--- a/TTS/.models.json
+++ b/TTS/.models.json
@@ -48,7 +48,6 @@
                     "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
                     "https://coqui.gateway.scarf.sh/hf/bark/text_2.pt",
                     "https://coqui.gateway.scarf.sh/hf/bark/config.json",
-                    "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
                     "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
                 ],
                 "default_vocoder": null,
diff --git a/TTS/__init__.py b/TTS/__init__.py
index 9e87bca4be..64c7369bc0 100644
--- a/TTS/__init__.py
+++ b/TTS/__init__.py
@@ -1,3 +1,29 @@
+import _codecs
 import importlib.metadata
+from collections import defaultdict
+
+import numpy as np
+import torch
+
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig
+from TTS.utils.radam import RAdam
 
 __version__ = importlib.metadata.version("coqui-tts")
+
+
+torch.serialization.add_safe_globals([dict, defaultdict, RAdam])
+
+# Bark
+torch.serialization.add_safe_globals(
+    [
+        np.core.multiarray.scalar,
+        np.dtype,
+        np.dtypes.Float64DType,
+        _codecs.encode,  # TODO: safe by default from Pytorch 2.5
+    ]
+)
+
+# XTTS
+torch.serialization.add_safe_globals([BaseDatasetConfig, XttsConfig, XttsAudioConfig, XttsArgs])
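The `add_safe_globals` calls above are what let every `torch.load(..., weights_only=True)` later in this patch reconstruct the non-tensor objects stored inside Coqui checkpoints. A minimal sketch of the mechanism, using a hypothetical `MyConfig` class in place of `XttsConfig` (the class name and field are illustrative, not from the repo):

```python
import torch


class MyConfig:
    """Stand-in for config classes (XttsConfig, BaseDatasetConfig, ...) pickled into checkpoints."""

    def __init__(self, sample_rate: int = 22050):
        self.sample_rate = sample_rate


torch.save({"config": MyConfig(), "step": 100}, "ckpt.pth")

try:
    # By default, weights_only=True only admits tensors and plain containers
    torch.load("ckpt.pth", weights_only=True)
except Exception as err:
    print(f"blocked: {err}")

# Allowlist the class once at import time, as TTS/__init__.py does above;
# the restricted unpickler can then rebuild instances of it safely.
torch.serialization.add_safe_globals([MyConfig])
ckpt = torch.load("ckpt.pth", weights_only=True)
print(ckpt["step"], ckpt["config"].sample_rate)  # 100 22050
```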
diff --git a/TTS/tts/configs/bark_config.py b/TTS/tts/configs/bark_config.py
index 3b893558aa..b846febe85 100644
--- a/TTS/tts/configs/bark_config.py
+++ b/TTS/tts/configs/bark_config.py
@@ -96,7 +96,6 @@ def __post_init__(self):
             "coarse": os.path.join(self.CACHE_DIR, "coarse_2.pt"),
             "fine": os.path.join(self.CACHE_DIR, "fine_2.pt"),
             "hubert_tokenizer": os.path.join(self.CACHE_DIR, "tokenizer.pth"),
-            "hubert": os.path.join(self.CACHE_DIR, "hubert.pt"),
         }
         self.SMALL_REMOTE_MODEL_PATHS = {
             "text": {"path": os.path.join(self.REMOTE_BASE_URL, "text.pt")},
diff --git a/TTS/tts/layers/bark/hubert/kmeans_hubert.py b/TTS/tts/layers/bark/hubert/kmeans_hubert.py
index 9e487b1e9d..58a614cb87 100644
--- a/TTS/tts/layers/bark/hubert/kmeans_hubert.py
+++ b/TTS/tts/layers/bark/hubert/kmeans_hubert.py
@@ -40,7 +40,7 @@ class CustomHubert(nn.Module):
     or you can train your own
     """
 
-    def __init__(self, checkpoint_path, target_sample_hz=16000, seq_len_multiple_of=None, output_layer=9, device=None):
+    def __init__(self, target_sample_hz=16000, seq_len_multiple_of=None, output_layer=9, device=None):
         super().__init__()
         self.target_sample_hz = target_sample_hz
         self.seq_len_multiple_of = seq_len_multiple_of
diff --git a/TTS/tts/layers/bark/inference_funcs.py b/TTS/tts/layers/bark/inference_funcs.py
index b2875c7a83..65c7800dcf 100644
--- a/TTS/tts/layers/bark/inference_funcs.py
+++ b/TTS/tts/layers/bark/inference_funcs.py
@@ -134,10 +134,9 @@ def generate_voice(
     # generate semantic tokens
     # Load the HuBERT model
     hubert_manager = HubertManager()
-    # hubert_manager.make_sure_hubert_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert"])
     hubert_manager.make_sure_tokenizer_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"])
 
-    hubert_model = CustomHubert(checkpoint_path=model.config.LOCAL_MODEL_PATHS["hubert"]).to(model.device)
+    hubert_model = CustomHubert().to(model.device)
 
     # Load the CustomTokenizer model
     tokenizer = HubertTokenizer.load_from_checkpoint(
diff --git a/TTS/tts/layers/bark/load_model.py b/TTS/tts/layers/bark/load_model.py
index ce6b757f05..7785aab845 100644
--- a/TTS/tts/layers/bark/load_model.py
+++ b/TTS/tts/layers/bark/load_model.py
@@ -118,7 +118,7 @@ def load_model(ckpt_path, device, config, model_type="text"):
         logger.info(f"{model_type} model not found, downloading...")
         _download(config.REMOTE_MODEL_PATHS[model_type]["path"], ckpt_path, config.CACHE_DIR)
 
-    checkpoint = torch.load(ckpt_path, map_location=device)
+    checkpoint = torch.load(ckpt_path, map_location=device, weights_only=True)
     # this is a hack
     model_args = checkpoint["model_args"]
     if "input_vocab_size" not in model_args:
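The `load_model` hunk above shows the pattern repeated throughout the rest of this patch: `weights_only=True` restricts the unpickler to tensors, primitive containers, and the globals allowlisted in `TTS/__init__.py`, while `map_location` still controls tensor placement. A hedged sketch of that load path (the local path is hypothetical; the key layout follows the Bark checkpoint structure read above):

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
ckpt_path = "text_2.pt"  # hypothetical local path to a downloaded Bark checkpoint

# Safe load: arbitrary code in the pickle stream is rejected,
# and tensors are materialized directly on `device`.
checkpoint = torch.load(ckpt_path, map_location=device, weights_only=True)

model_args = checkpoint["model_args"]  # plain dicts survive weights-only loading
state_dict = checkpoint["model"]       # tensors are always permitted
```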
diff --git a/TTS/tts/layers/tortoise/arch_utils.py b/TTS/tts/layers/tortoise/arch_utils.py
index c79ef31b0c..f4dbcc8054 100644
--- a/TTS/tts/layers/tortoise/arch_utils.py
+++ b/TTS/tts/layers/tortoise/arch_utils.py
@@ -332,7 +332,7 @@ def __init__(
         self.mel_norm_file = mel_norm_file
         if self.mel_norm_file is not None:
             with fsspec.open(self.mel_norm_file) as f:
-                self.mel_norms = torch.load(f)
+                self.mel_norms = torch.load(f, weights_only=True)
         else:
             self.mel_norms = None
 
diff --git a/TTS/tts/layers/tortoise/audio_utils.py b/TTS/tts/layers/tortoise/audio_utils.py
index 0b8701227b..94c2bae6fa 100644
--- a/TTS/tts/layers/tortoise/audio_utils.py
+++ b/TTS/tts/layers/tortoise/audio_utils.py
@@ -124,7 +124,7 @@ def load_voice(voice: str, extra_voice_dirs: List[str] = []):
     voices = get_voices(extra_voice_dirs)
     paths = voices[voice]
     if len(paths) == 1 and paths[0].endswith(".pth"):
-        return None, torch.load(paths[0])
+        return None, torch.load(paths[0], weights_only=True)
     else:
         conds = []
         for cond_path in paths:
diff --git a/TTS/tts/layers/xtts/dvae.py b/TTS/tts/layers/xtts/dvae.py
index 4a37307e74..58f91785a1 100644
--- a/TTS/tts/layers/xtts/dvae.py
+++ b/TTS/tts/layers/xtts/dvae.py
@@ -46,7 +46,7 @@ def dvae_wav_to_mel(
     mel = mel_stft(wav)
     mel = torch.log(torch.clamp(mel, min=1e-5))
     if mel_norms is None:
-        mel_norms = torch.load(mel_norms_file, map_location=device)
+        mel_norms = torch.load(mel_norms_file, map_location=device, weights_only=True)
     mel = mel / mel_norms.unsqueeze(0).unsqueeze(-1)
     return mel
 
diff --git a/TTS/tts/layers/xtts/hifigan_decoder.py b/TTS/tts/layers/xtts/hifigan_decoder.py
index b6032e5584..09bd06dfde 100644
--- a/TTS/tts/layers/xtts/hifigan_decoder.py
+++ b/TTS/tts/layers/xtts/hifigan_decoder.py
@@ -328,7 +328,7 @@ def remove_weight_norm(self):
     def load_checkpoint(
         self, config, checkpoint_path, eval=False, cache=False
     ):  # pylint: disable=unused-argument, redefined-builtin
-        state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
+        state = torch.load(checkpoint_path, map_location=torch.device("cpu"), weights_only=True)
         self.load_state_dict(state["model"])
         if eval:
             self.eval()
diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py
index 04d123778b..f1aa6f8cd0 100644
--- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py
+++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py
@@ -91,7 +91,7 @@ def __init__(self, config: Coqpit):
 
         # load GPT if available
         if self.args.gpt_checkpoint:
-            gpt_checkpoint = torch.load(self.args.gpt_checkpoint, map_location=torch.device("cpu"))
+            gpt_checkpoint = torch.load(self.args.gpt_checkpoint, map_location=torch.device("cpu"), weights_only=True)
             # deal with coqui Trainer exported model
             if "model" in gpt_checkpoint.keys() and "config" in gpt_checkpoint.keys():
                 logger.info("Coqui Trainer checkpoint detected! Converting it!")
@@ -184,7 +184,7 @@ def __init__(self, config: Coqpit):
             self.dvae.eval()
 
         if self.args.dvae_checkpoint:
-            dvae_checkpoint = torch.load(self.args.dvae_checkpoint, map_location=torch.device("cpu"))
+            dvae_checkpoint = torch.load(self.args.dvae_checkpoint, map_location=torch.device("cpu"), weights_only=True)
             self.dvae.load_state_dict(dvae_checkpoint, strict=False)
             logger.info("DVAE weights restored from: %s", self.args.dvae_checkpoint)
         else:
diff --git a/TTS/tts/layers/xtts/xtts_manager.py b/TTS/tts/layers/xtts/xtts_manager.py
index 5560e87687..5a3c47aead 100644
--- a/TTS/tts/layers/xtts/xtts_manager.py
+++ b/TTS/tts/layers/xtts/xtts_manager.py
@@ -3,7 +3,7 @@
 
 class SpeakerManager:
     def __init__(self, speaker_file_path=None):
-        self.speakers = torch.load(speaker_file_path)
+        self.speakers = torch.load(speaker_file_path, weights_only=True)
 
     @property
     def name_to_id(self):
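For `SpeakerManager`, the speaker file is a `.pth` dict mapping speaker names to conditioning tensors, which is exactly the shape of data `weights_only=True` is designed for: no allowlisting is needed at all. A sketch (the file name and tensor shapes are illustrative, not taken from the repo):

```python
import torch

# Build and save a toy speaker file (shapes are made up for illustration)
speakers = {
    "speaker_a": {
        "gpt_cond_latent": torch.zeros(1, 32, 1024),
        "speaker_embedding": torch.zeros(1, 512, 1),
    }
}
torch.save(speakers, "speakers.pth")

# Loads under the restricted unpickler: only dicts, strings, and tensors are involved
loaded = torch.load("speakers.pth", map_location="cpu", weights_only=True)
print(list(loaded.keys()))  # ['speaker_a']
```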
diff --git a/TTS/tts/models/bark.py b/TTS/tts/models/bark.py
index cdfb5efae4..ced8f60ed8 100644
--- a/TTS/tts/models/bark.py
+++ b/TTS/tts/models/bark.py
@@ -243,7 +243,6 @@ def load_checkpoint(
         text_model_path=None,
         coarse_model_path=None,
         fine_model_path=None,
-        hubert_model_path=None,
         hubert_tokenizer_path=None,
         eval=False,
         strict=True,
@@ -266,13 +265,11 @@ def load_checkpoint(
         text_model_path = text_model_path or os.path.join(checkpoint_dir, "text_2.pt")
         coarse_model_path = coarse_model_path or os.path.join(checkpoint_dir, "coarse_2.pt")
         fine_model_path = fine_model_path or os.path.join(checkpoint_dir, "fine_2.pt")
-        hubert_model_path = hubert_model_path or os.path.join(checkpoint_dir, "hubert.pt")
         hubert_tokenizer_path = hubert_tokenizer_path or os.path.join(checkpoint_dir, "tokenizer.pth")
 
         self.config.LOCAL_MODEL_PATHS["text"] = text_model_path
         self.config.LOCAL_MODEL_PATHS["coarse"] = coarse_model_path
         self.config.LOCAL_MODEL_PATHS["fine"] = fine_model_path
-        self.config.LOCAL_MODEL_PATHS["hubert"] = hubert_model_path
         self.config.LOCAL_MODEL_PATHS["hubert_tokenizer"] = hubert_tokenizer_path
 
         self.load_bark_models()
diff --git a/TTS/tts/models/neuralhmm_tts.py b/TTS/tts/models/neuralhmm_tts.py
index 277369e644..49c48c2bd4 100644
--- a/TTS/tts/models/neuralhmm_tts.py
+++ b/TTS/tts/models/neuralhmm_tts.py
@@ -107,7 +107,7 @@ def update_mean_std(self, statistics_dict: Dict):
 
     def preprocess_batch(self, text, text_len, mels, mel_len):
         if self.mean.item() == 0 or self.std.item() == 1:
-            statistics_dict = torch.load(self.mel_statistics_parameter_path)
+            statistics_dict = torch.load(self.mel_statistics_parameter_path, weights_only=True)
             self.update_mean_std(statistics_dict)
 
         mels = self.normalize(mels)
@@ -292,7 +292,7 @@ def on_init_start(self, trainer):
                 "Data parameters found for: %s. Loading mel normalization parameters...",
                 trainer.config.mel_statistics_parameter_path,
             )
-            statistics = torch.load(trainer.config.mel_statistics_parameter_path)
+            statistics = torch.load(trainer.config.mel_statistics_parameter_path, weights_only=True)
             data_mean, data_std, init_transition_prob = (
                 statistics["mean"],
                 statistics["std"],
diff --git a/TTS/tts/models/overflow.py b/TTS/tts/models/overflow.py
index b05b75009b..4c0f341be3 100644
--- a/TTS/tts/models/overflow.py
+++ b/TTS/tts/models/overflow.py
@@ -120,7 +120,7 @@ def update_mean_std(self, statistics_dict: Dict):
 
     def preprocess_batch(self, text, text_len, mels, mel_len):
         if self.mean.item() == 0 or self.std.item() == 1:
-            statistics_dict = torch.load(self.mel_statistics_parameter_path)
+            statistics_dict = torch.load(self.mel_statistics_parameter_path, weights_only=True)
             self.update_mean_std(statistics_dict)
 
         mels = self.normalize(mels)
@@ -308,7 +308,7 @@ def on_init_start(self, trainer):
                 "Data parameters found for: %s. Loading mel normalization parameters...",
                 trainer.config.mel_statistics_parameter_path,
             )
-            statistics = torch.load(trainer.config.mel_statistics_parameter_path)
+            statistics = torch.load(trainer.config.mel_statistics_parameter_path, weights_only=True)
             data_mean, data_std, init_transition_prob = (
                 statistics["mean"],
                 statistics["std"],
diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py
index 17303c69f7..98e79d0cf1 100644
--- a/TTS/tts/models/tortoise.py
+++ b/TTS/tts/models/tortoise.py
@@ -170,7 +170,9 @@ def classify_audio_clip(clip, model_dir):
         kernel_size=5,
         distribute_zero_label=False,
     )
-    classifier.load_state_dict(torch.load(os.path.join(model_dir, "classifier.pth"), map_location=torch.device("cpu")))
+    classifier.load_state_dict(
+        torch.load(os.path.join(model_dir, "classifier.pth"), map_location=torch.device("cpu"), weights_only=True)
+    )
     clip = clip.cpu().unsqueeze(0)
     results = F.softmax(classifier(clip), dim=-1)
     return results[0][0]
@@ -488,6 +490,7 @@ def get_random_conditioning_latents(self):
             torch.load(
                 os.path.join(self.models_dir, "rlg_auto.pth"),
                 map_location=torch.device("cpu"),
+                weights_only=True,
             )
         )
         self.rlg_diffusion = RandomLatentConverter(2048).eval()
@@ -495,6 +498,7 @@ def get_random_conditioning_latents(self):
             torch.load(
                 os.path.join(self.models_dir, "rlg_diffuser.pth"),
                 map_location=torch.device("cpu"),
+                weights_only=True,
             )
         )
         with torch.no_grad():
@@ -881,17 +885,17 @@ def load_checkpoint(
 
         if os.path.exists(ar_path):
             # remove keys from the checkpoint that are not in the model
-            checkpoint = torch.load(ar_path, map_location=torch.device("cpu"))
+            checkpoint = torch.load(ar_path, map_location=torch.device("cpu"), weights_only=True)
 
             # strict set False
             # due to removed `bias` and `masked_bias` changes in Transformers
             self.autoregressive.load_state_dict(checkpoint, strict=False)
 
         if os.path.exists(diff_path):
-            self.diffusion.load_state_dict(torch.load(diff_path), strict=strict)
+            self.diffusion.load_state_dict(torch.load(diff_path, weights_only=True), strict=strict)
 
         if os.path.exists(clvp_path):
-            self.clvp.load_state_dict(torch.load(clvp_path), strict=strict)
+            self.clvp.load_state_dict(torch.load(clvp_path, weights_only=True), strict=strict)
 
         if os.path.exists(vocoder_checkpoint_path):
             self.vocoder.load_state_dict(
@@ -899,6 +903,7 @@ def load_checkpoint(
                 torch.load(
                     vocoder_checkpoint_path,
                     map_location=torch.device("cpu"),
+                    weights_only=True,
                 )
             )
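The tortoise hunks keep `strict=False` for the autoregressive model because older checkpoints still carry `bias`/`masked_bias` buffers that newer Transformers releases removed. A minimal sketch of how `load_state_dict` reports, rather than raises on, such leftover keys (the toy module and key names are illustrative):

```python
import torch
import torch.nn as nn

model = nn.Linear(4, 4)
checkpoint = {
    "weight": torch.zeros(4, 4),
    "bias": torch.zeros(4),
    "masked_bias": torch.zeros(1),  # stale buffer the current model no longer has
}

# strict=False: mismatched keys are returned in a NamedTuple instead of raising
missing, unexpected = model.load_state_dict(checkpoint, strict=False)
print(missing)     # []
print(unexpected)  # ['masked_bias']
```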
diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py
index ef09344217..0b7652e450 100644
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@@ -65,7 +65,7 @@ def wav_to_mel_cloning(
     mel = mel_stft(wav)
     mel = torch.log(torch.clamp(mel, min=1e-5))
     if mel_norms is None:
-        mel_norms = torch.load(mel_norms_file, map_location=device)
+        mel_norms = torch.load(mel_norms_file, map_location=device, weights_only=True)
     mel = mel / mel_norms.unsqueeze(0).unsqueeze(-1)
     return mel
 
diff --git a/TTS/tts/utils/fairseq.py b/TTS/tts/utils/fairseq.py
index 3d8eec2b4e..6eb1905d96 100644
--- a/TTS/tts/utils/fairseq.py
+++ b/TTS/tts/utils/fairseq.py
@@ -2,7 +2,7 @@
 
 
 def rehash_fairseq_vits_checkpoint(checkpoint_file):
-    chk = torch.load(checkpoint_file, map_location=torch.device("cpu"))["model"]
+    chk = torch.load(checkpoint_file, map_location=torch.device("cpu"), weights_only=True)["model"]
     new_chk = {}
     for k, v in chk.items():
         if "enc_p." in k:
diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py
index 23aa52a8a2..6f72581c08 100644
--- a/TTS/tts/utils/managers.py
+++ b/TTS/tts/utils/managers.py
@@ -17,7 +17,7 @@ def load_file(path: str):
             return json.load(f)
     elif path.endswith(".pth"):
         with fsspec.open(path, "rb") as f:
-            return torch.load(f, map_location="cpu")
+            return torch.load(f, map_location="cpu", weights_only=True)
     else:
         raise ValueError("Unsupported file type")
 
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 50a7893047..90af4f48f9 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -12,9 +12,6 @@
 from TTS.tts.configs.vits_config import VitsConfig
 from TTS.tts.models import setup_model as setup_tts_model
 from TTS.tts.models.vits import Vits
-
-# pylint: disable=unused-wildcard-import
-# pylint: disable=wildcard-import
 from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.audio.numpy_transforms import save_wav
diff --git a/TTS/vc/modules/freevc/wavlm/__init__.py b/TTS/vc/modules/freevc/wavlm/__init__.py
index 03b2f5827b..528fade772 100644
--- a/TTS/vc/modules/freevc/wavlm/__init__.py
+++ b/TTS/vc/modules/freevc/wavlm/__init__.py
@@ -26,7 +26,7 @@ def get_wavlm(device="cpu"):
         logger.info("Downloading WavLM model to %s ...", output_path)
         urllib.request.urlretrieve(model_uri, output_path)
 
-    checkpoint = torch.load(output_path, map_location=torch.device(device))
+    checkpoint = torch.load(output_path, map_location=torch.device(device), weights_only=True)
     cfg = WavLMConfig(checkpoint["cfg"])
     wavlm = WavLM(cfg).to(device)
     wavlm.load_state_dict(checkpoint["model"])
diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb
index d85ca1035a..f52fa028e5 100644
--- a/notebooks/TestAttention.ipynb
+++ b/notebooks/TestAttention.ipynb
@@ -119,9 +119,9 @@
     "\n",
     "# load model state\n",
     "if use_cuda:\n",
-    "    cp = torch.load(MODEL_PATH)\n",
+    "    cp = torch.load(MODEL_PATH, weights_only=True)\n",
     "else:\n",
-    "    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)\n",
+    "    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage, weights_only=True)\n",
     "\n",
     "# load the model\n",
     "model.load_state_dict(cp['model'])\n",
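The `torch>=2.4` floor in the pyproject hunk below lines up with the new import-time calls: `torch.serialization.add_safe_globals` first shipped in PyTorch 2.4 (and, per the TODO in `TTS/__init__.py`, some of the numpy entries become allowlisted by default in 2.5). A defensive version guard like the following is an illustration of that dependency, not part of this patch:

```python
import torch
from packaging.version import Version

# Hypothetical guard for code that must also import on older torch releases
if Version(torch.__version__) < Version("2.4"):
    raise RuntimeError("safe checkpoint loading here relies on torch>=2.4 (add_safe_globals)")
torch.serialization.add_safe_globals([dict])
```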
"torchaudio", "soundfile>=0.12.0", "librosa>=0.10.1",