diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index 49a5b300..2bbcf3cd 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -10,7 +10,7 @@ jobs: build-sdist: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Verify tag matches version run: | set -ex @@ -38,7 +38,7 @@ jobs: matrix: python-version: ["3.9", "3.10", "3.11"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} diff --git a/TTS/tts/layers/tortoise/diffusion.py b/TTS/tts/layers/tortoise/diffusion.py index cb350af7..7bea02ca 100644 --- a/TTS/tts/layers/tortoise/diffusion.py +++ b/TTS/tts/layers/tortoise/diffusion.py @@ -13,12 +13,18 @@ import math import numpy as np import torch import torch as th -from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral from tqdm import tqdm from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper -K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m} +try: + from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral + + K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m} +except ImportError: + K_DIFFUSION_SAMPLERS = None + + SAMPLERS = ["dpm++2m", "p", "ddim"] @@ -531,6 +537,8 @@ class GaussianDiffusion: if self.conditioning_free is not True: raise RuntimeError("cond_free must be true") with tqdm(total=self.num_timesteps) as pbar: + if K_DIFFUSION_SAMPLERS is None: + raise ModuleNotFoundError("Install k_diffusion for using k_diffusion samplers") return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs) else: raise RuntimeError("sampler not impl") diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py index d914ebf9..e7b186b8 100644 --- a/TTS/tts/layers/xtts/gpt.py +++ 
b/TTS/tts/layers/xtts/gpt.py @@ -441,7 +441,9 @@ class GPT(nn.Module): audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token) # Pad mel codes with stop_audio_token - audio_codes = self.set_mel_padding(audio_codes, code_lengths - 3) # -3 to get the real code lengths without consider start and stop tokens that was not added yet + audio_codes = self.set_mel_padding( + audio_codes, code_lengths - 3 + ) # -3 to get the real code lengths without considering start and stop tokens that were not added yet # Build input and target tensors # Prepend start token to inputs and append stop token to targets diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index 1ef655a3..52848743 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -1,23 +1,22 @@ import os import re -import torch -import pypinyin import textwrap - from functools import cached_property + +import pypinyin +import torch from hangul_romanize import Transliter from hangul_romanize.rule import academic from num2words import num2words +from spacy.lang.ar import Arabic +from spacy.lang.en import English +from spacy.lang.es import Spanish +from spacy.lang.ja import Japanese +from spacy.lang.zh import Chinese from tokenizers import Tokenizer from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words -from spacy.lang.en import English -from spacy.lang.zh import Chinese -from spacy.lang.ja import Japanese -from spacy.lang.ar import Arabic -from spacy.lang.es import Spanish - def get_spacy_lang(lang): if lang == "zh": @@ -32,6 +31,7 @@ def get_spacy_lang(lang): # For most languages, Enlish does the job return English() + def split_sentence(text, lang, text_split_length=250): """Preprocess the input text""" text_splits = [] @@ -67,6 +67,7 @@ def split_sentence(text, lang, text_split_length=250): return text_splits + _whitespace_re = re.compile(r"\s+") # List of (regular expression, replacement) pairs for abbreviations: @@
-619,7 +620,7 @@ class VoiceBpeTokenizer: return cutlet.Cutlet() def check_input_length(self, txt, lang): - lang = lang.split("-")[0] # remove the region + lang = lang.split("-")[0] # remove the region limit = self.char_limits.get(lang, 250) if len(txt) > limit: print( @@ -640,7 +641,7 @@ class VoiceBpeTokenizer: return txt def encode(self, txt, lang): - lang = lang.split("-")[0] # remove the region + lang = lang.split("-")[0] # remove the region self.check_input_length(txt, lang) txt = self.preprocess_text(txt, lang) lang = "zh-cn" if lang == "zh" else lang diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 3583591f..208ec4d5 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -513,13 +513,13 @@ class Xtts(BaseTTS): enable_text_splitting=False, **hf_generate_kwargs, ): - language = language.split("-")[0] # remove the country code + language = language.split("-")[0] # remove the country code length_scale = 1.0 / max(speed, 0.05) if enable_text_splitting: text = split_sentence(text, language, self.tokenizer.char_limits[language]) else: text = [text] - + wavs = [] gpt_latents_list = [] for sent in text: @@ -563,9 +563,7 @@ class Xtts(BaseTTS): if length_scale != 1.0: gpt_latents = F.interpolate( - gpt_latents.transpose(1, 2), - scale_factor=length_scale, - mode="linear" + gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear" ).transpose(1, 2) gpt_latents_list.append(gpt_latents.cpu()) @@ -623,7 +621,7 @@ class Xtts(BaseTTS): enable_text_splitting=False, **hf_generate_kwargs, ): - language = language.split("-")[0] # remove the country code + language = language.split("-")[0] # remove the country code length_scale = 1.0 / max(speed, 0.05) if enable_text_splitting: text = split_sentence(text, language, self.tokenizer.char_limits[language]) @@ -675,9 +673,7 @@ class Xtts(BaseTTS): gpt_latents = torch.cat(all_latents, dim=0)[None, :] if length_scale != 1.0: gpt_latents = F.interpolate( - gpt_latents.transpose(1, 2), - 
scale_factor=length_scale, - mode="linear" + gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear" ).transpose(1, 2) wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device)) wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks( diff --git a/requirements.txt b/requirements.txt index 836de40a..ce0e5d92 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,31 +1,31 @@ # core deps numpy==1.22.0;python_version<="3.10" -numpy==1.24.3;python_version>"3.10" -cython==0.29.30 +numpy>=1.24.3;python_version>"3.10" +cython>=0.29.30 scipy>=1.11.2 torch>=2.1 torchaudio -soundfile==0.12.* -librosa==0.10.* -scikit-learn==1.3.0 +soundfile>=0.12.0 +librosa>=0.10.0 +scikit-learn>=1.3.0 numba==0.55.1;python_version<"3.9" -numba==0.57.0;python_version>="3.9" -inflect==5.6.* -tqdm==4.64.* -anyascii==0.3.* -pyyaml==6.* -fsspec==2023.6.0 # <= 2023.9.1 makes aux tests fail -aiohttp==3.8.* -packaging==23.1 +numba>=0.57.0;python_version>="3.9" +inflect>=5.6.0 +tqdm>=4.64.1 +anyascii>=0.3.0 +pyyaml>=6.0 +fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail +aiohttp>=3.8.1 +packaging>=23.1 # deps for examples -flask==2.* +flask>=2.0.1 # deps for inference -pysbd==0.3.4 +pysbd>=0.3.4 # deps for notebooks -umap-learn==0.5.* +umap-learn>=0.5.1 pandas>=1.4,<2.0 # deps for training -matplotlib==3.7.* +matplotlib>=3.7.0 # coqui stack trainer # config management @@ -46,12 +46,11 @@ bangla bnnumerizer bnunicodenormalizer #deps for tortoise -k_diffusion -einops==0.6.* -transformers==4.33.* +einops>=0.6.0 +transformers>=4.33.0 #deps for bark -encodec==0.1.* +encodec>=0.1.1 # deps for XTTS -unidecode==1.3.* +unidecode>=1.3.2 num2words spacy[ja]>=3 \ No newline at end of file diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index a5aad5c1..8fa56e28 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -186,7 +186,7 @@ def test_xtts_v2_streaming(): "en", gpt_cond_latent, speaker_embedding, - speed=1.5 
+ speed=1.5, ) wav_chuncks = [] for i, chunk in enumerate(chunks): @@ -198,7 +198,7 @@ def test_xtts_v2_streaming(): "en", gpt_cond_latent, speaker_embedding, - speed=0.66 + speed=0.66, ) wav_chuncks = [] for i, chunk in enumerate(chunks):