Mirror of https://github.com/coqui-ai/TTS.git, commit 14579a4607.
--- a/.github/workflows/pypi-release.yml
+++ b/.github/workflows/pypi-release.yml
@@ -10,7 +10,7 @@ jobs:
   build-sdist:
     runs-on: ubuntu-20.04
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Verify tag matches version
         run: |
           set -ex
@@ -38,7 +38,7 @@ jobs:
       matrix:
         python-version: ["3.9", "3.10", "3.11"]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
--- a/TTS/tts/layers/tortoise/diffusion.py
+++ b/TTS/tts/layers/tortoise/diffusion.py
@@ -13,12 +13,18 @@ import math
 import numpy as np
 import torch
 import torch as th
-from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
 from tqdm import tqdm
 
 from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper
 
-K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
+try:
+    from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
+
+    K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
+except ImportError:
+    K_DIFFUSION_SAMPLERS = None
+
 
 SAMPLERS = ["dpm++2m", "p", "ddim"]
@@ -531,6 +537,8 @@ class GaussianDiffusion:
             if self.conditioning_free is not True:
                 raise RuntimeError("cond_free must be true")
             with tqdm(total=self.num_timesteps) as pbar:
+                if K_DIFFUSION_SAMPLERS is None:
+                    raise ModuleNotFoundError("Install k_diffusion for using k_diffusion samplers")
                 return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs)
         else:
             raise RuntimeError("sampler not impl")
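Note: the two diffusion.py hunks above work together to make k_diffusion an
optional dependency: a failed import is swallowed at module load, and the
error surfaces only when a k-diffusion sampler is actually requested. A
minimal sketch of the pattern, with a hypothetical fancy_sampler module
standing in for k_diffusion:

    # Optional-dependency pattern (fancy_sampler is a made-up module name).
    try:
        from fancy_sampler import sample  # optional extra; may be absent

        SAMPLERS = {"fancy": sample}
    except ImportError:
        SAMPLERS = None  # importing this module still succeeds

    def run_sampler(name, *args):
        # Defer the failure from import time to call time, with a clear message.
        if SAMPLERS is None:
            raise ModuleNotFoundError("Install fancy_sampler to use these samplers")
        return SAMPLERS[name](*args)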
--- a/TTS/tts/layers/xtts/gpt.py
+++ b/TTS/tts/layers/xtts/gpt.py
@@ -441,7 +441,9 @@ class GPT(nn.Module):
         audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token)
 
         # Pad mel codes with stop_audio_token
-        audio_codes = self.set_mel_padding(audio_codes, code_lengths - 3)  # -3 to get the real code lengths without consider start and stop tokens that was not added yet
+        audio_codes = self.set_mel_padding(
+            audio_codes, code_lengths - 3
+        )  # -3 to get the real code lengths without consider start and stop tokens that was not added yet
 
         # Build input and target tensors
         # Prepend start token to inputs and append stop token to targets
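Note: the gpt.py hunk above is a pure line-length reflow; behavior is
unchanged. For context, the F.pad call in the surrounding code appends one
stop-token column to the code tensor. A standalone sketch (the token id and
tensor shape are assumptions, not the model's real configuration):

    import torch
    import torch.nn.functional as F

    stop_audio_token = 1025                        # assumed token id
    audio_codes = torch.randint(0, 1024, (2, 10))  # (batch, n_codes), assumed shape

    # Pad spec (0, 1) acts on the last dim: 0 columns on the left, 1 on the right.
    audio_codes = F.pad(audio_codes, (0, 1), value=stop_audio_token)
    print(audio_codes.shape)  # torch.Size([2, 11])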
--- a/TTS/tts/layers/xtts/tokenizer.py
+++ b/TTS/tts/layers/xtts/tokenizer.py
@@ -1,23 +1,22 @@
 import os
 import re
-import torch
-import pypinyin
 import textwrap
-
 from functools import cached_property
 
+import pypinyin
+import torch
 from hangul_romanize import Transliter
 from hangul_romanize.rule import academic
 from num2words import num2words
+from spacy.lang.ar import Arabic
+from spacy.lang.en import English
+from spacy.lang.es import Spanish
+from spacy.lang.ja import Japanese
+from spacy.lang.zh import Chinese
 from tokenizers import Tokenizer
 
 from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
 
-from spacy.lang.en import English
-from spacy.lang.zh import Chinese
-from spacy.lang.ja import Japanese
-from spacy.lang.ar import Arabic
-from spacy.lang.es import Spanish
 
 
 def get_spacy_lang(lang):
     if lang == "zh":
@@ -32,6 +31,7 @@ def get_spacy_lang(lang):
         # For most languages, Enlish does the job
         return English()
 
+
 def split_sentence(text, lang, text_split_length=250):
     """Preprocess the input text"""
     text_splits = []
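Note: get_spacy_lang returns a blank spaCy pipeline, which split_sentence can
use for rule-based sentence segmentation. A sketch of that usage, assuming
spaCy's stock "sentencizer" component:

    from spacy.lang.en import English

    nlp = English()              # blank pipeline; no model download needed
    nlp.add_pipe("sentencizer")  # rule-based sentence boundary detection
    doc = nlp("First sentence. Second one!")
    print([s.text for s in doc.sents])  # ['First sentence.', 'Second one!']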
@@ -67,6 +67,7 @@ def split_sentence(text, lang, text_split_length=250):
 
     return text_splits
 
+
 _whitespace_re = re.compile(r"\s+")
 
 # List of (regular expression, replacement) pairs for abbreviations:
@@ -619,7 +620,7 @@ class VoiceBpeTokenizer:
         return cutlet.Cutlet()
 
     def check_input_length(self, txt, lang):
-        lang = lang.split("-")[0] # remove the region
+        lang = lang.split("-")[0]  # remove the region
         limit = self.char_limits.get(lang, 250)
         if len(txt) > limit:
             print(
@@ -640,7 +641,7 @@ class VoiceBpeTokenizer:
         return txt
 
     def encode(self, txt, lang):
-        lang = lang.split("-")[0] # remove the region
+        lang = lang.split("-")[0]  # remove the region
         self.check_input_length(txt, lang)
         txt = self.preprocess_text(txt, lang)
         lang = "zh-cn" if lang == "zh" else lang
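Note: both tokenizer hunks fix comment spacing around the same idiom: a
region-qualified tag such as "zh-cn" or "en-US" is reduced to its base
language before the per-language character limit is looked up. A sketch with
illustrative limits (not the tokenizer's real table):

    lang = "zh-cn".split("-")[0]                  # -> "zh"
    limit = {"en": 250, "zh": 82}.get(lang, 250)  # assumed limits; 250 default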
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@@ -513,13 +513,13 @@ class Xtts(BaseTTS):
         enable_text_splitting=False,
         **hf_generate_kwargs,
     ):
-        language = language.split("-")[0] # remove the country code
+        language = language.split("-")[0]  # remove the country code
         length_scale = 1.0 / max(speed, 0.05)
         if enable_text_splitting:
             text = split_sentence(text, language, self.tokenizer.char_limits[language])
         else:
             text = [text]
 
         wavs = []
         gpt_latents_list = []
         for sent in text:
@@ -563,9 +563,7 @@ class Xtts(BaseTTS):
 
             if length_scale != 1.0:
                 gpt_latents = F.interpolate(
-                    gpt_latents.transpose(1, 2),
-                    scale_factor=length_scale,
-                    mode="linear"
+                    gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
                 ).transpose(1, 2)
 
             gpt_latents_list.append(gpt_latents.cpu())
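Note: the collapsed F.interpolate call is how speed control works here: speed
is inverted into length_scale, and the GPT latents are linearly resampled
along the time axis, so speed=1.5 yields roughly two-thirds as many frames.
A runnable sketch (the latent shape is an assumption):

    import torch
    import torch.nn.functional as F

    speed = 1.5
    length_scale = 1.0 / max(speed, 0.05)    # clamp guards against speed near 0
    gpt_latents = torch.randn(1, 100, 1024)  # (batch, time, channels), assumed

    stretched = F.interpolate(
        gpt_latents.transpose(1, 2),  # F.interpolate expects (batch, channels, time)
        scale_factor=length_scale,
        mode="linear",
    ).transpose(1, 2)
    print(stretched.shape)  # torch.Size([1, 66, 1024])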
@@ -623,7 +621,7 @@ class Xtts(BaseTTS):
         enable_text_splitting=False,
         **hf_generate_kwargs,
     ):
-        language = language.split("-")[0] # remove the country code
+        language = language.split("-")[0]  # remove the country code
         length_scale = 1.0 / max(speed, 0.05)
         if enable_text_splitting:
             text = split_sentence(text, language, self.tokenizer.char_limits[language])
@@ -675,9 +673,7 @@ class Xtts(BaseTTS):
             gpt_latents = torch.cat(all_latents, dim=0)[None, :]
             if length_scale != 1.0:
                 gpt_latents = F.interpolate(
-                    gpt_latents.transpose(1, 2),
-                    scale_factor=length_scale,
-                    mode="linear"
+                    gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
                 ).transpose(1, 2)
             wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device))
             wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks(
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,31 +1,31 @@
 # core deps
 numpy==1.22.0;python_version<="3.10"
-numpy==1.24.3;python_version>"3.10"
-cython==0.29.30
+numpy>=1.24.3;python_version>"3.10"
+cython>=0.29.30
 scipy>=1.11.2
 torch>=2.1
 torchaudio
-soundfile==0.12.*
-librosa==0.10.*
-scikit-learn==1.3.0
+soundfile>=0.12.0
+librosa>=0.10.0
+scikit-learn>=1.3.0
 numba==0.55.1;python_version<"3.9"
-numba==0.57.0;python_version>="3.9"
-inflect==5.6.*
-tqdm==4.64.*
-anyascii==0.3.*
-pyyaml==6.*
-fsspec==2023.6.0 # <= 2023.9.1 makes aux tests fail
-aiohttp==3.8.*
-packaging==23.1
+numba>=0.57.0;python_version>="3.9"
+inflect>=5.6.0
+tqdm>=4.64.1
+anyascii>=0.3.0
+pyyaml>=6.0
+fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
+aiohttp>=3.8.1
+packaging>=23.1
 # deps for examples
-flask==2.*
+flask>=2.0.1
 # deps for inference
-pysbd==0.3.4
+pysbd>=0.3.4
 # deps for notebooks
-umap-learn==0.5.*
+umap-learn>=0.5.1
 pandas>=1.4,<2.0
 # deps for training
-matplotlib==3.7.*
+matplotlib>=3.7.0
 # coqui stack
 trainer
 # config management
@@ -46,12 +46,11 @@ bangla
 bnnumerizer
 bnunicodenormalizer
 #deps for tortoise
-k_diffusion
-einops==0.6.*
-transformers==4.33.*
+einops>=0.6.0
+transformers>=4.33.0
 #deps for bark
-encodec==0.1.*
+encodec>=0.1.1
 # deps for XTTS
-unidecode==1.3.*
+unidecode>=1.3.2
 num2words
 spacy[ja]>=3
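Note: the common thread in both requirements.txt hunks is replacing exact and
wildcard pins (==x.y.z, ==x.y.*) with minimum-version floors (>=), and
dropping k_diffusion, which the diffusion.py change above turned into an
optional import. The practical difference between the two operators can be
checked with the packaging library (the PEP 440 implementation that pip
vendors); a sketch:

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    print(Version("3.0.7") in SpecifierSet(">=0.29.30"))  # True  (floor only)
    print(Version("3.0.7") in SpecifierSet("==0.29.30"))  # False (exact pin)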
--- a/tests/zoo_tests/test_models.py
+++ b/tests/zoo_tests/test_models.py
@@ -186,7 +186,7 @@ def test_xtts_v2_streaming():
         "en",
         gpt_cond_latent,
         speaker_embedding,
-        speed=1.5
+        speed=1.5,
     )
     wav_chuncks = []
     for i, chunk in enumerate(chunks):
@@ -198,7 +198,7 @@ def test_xtts_v2_streaming():
         "en",
         gpt_cond_latent,
         speaker_embedding,
-        speed=0.66
+        speed=0.66,
     )
     wav_chuncks = []
     for i, chunk in enumerate(chunks):