mirror of https://github.com/coqui-ai/TTS.git
commit 14579a4607
@@ -10,7 +10,7 @@ jobs:
   build-sdist:
     runs-on: ubuntu-20.04
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Verify tag matches version
        run: |
          set -ex
@@ -38,7 +38,7 @@ jobs:
       matrix:
         python-version: ["3.9", "3.10", "3.11"]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
@@ -13,12 +13,18 @@ import math
 import numpy as np
 import torch
 import torch as th
-from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
 from tqdm import tqdm
 
 from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper
 
-K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
+try:
+    from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
+
+    K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
+except ImportError:
+    K_DIFFUSION_SAMPLERS = None
+
+
 SAMPLERS = ["dpm++2m", "p", "ddim"]
 
 
@@ -531,6 +537,8 @@ class GaussianDiffusion:
             if self.conditioning_free is not True:
                 raise RuntimeError("cond_free must be true")
             with tqdm(total=self.num_timesteps) as pbar:
+                if K_DIFFUSION_SAMPLERS is None:
+                    raise ModuleNotFoundError("Install k_diffusion for using k_diffusion samplers")
                 return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs)
         else:
             raise RuntimeError("sampler not impl")
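
Note: the two hunks above make k_diffusion an optional dependency: the import is wrapped in try/except at module load, and the K_DIFFUSION_SAMPLERS sentinel is re-checked at call time. A minimal self-contained sketch of the same pattern, using a hypothetical package name optional_pkg in place of the real k_diffusion:

try:
    from optional_pkg import fancy_sampler  # hypothetical optional dependency

    SAMPLER_REGISTRY = {"fancy": fancy_sampler}
except ImportError:
    SAMPLER_REGISTRY = None  # package missing; defer the failure to call time


def sample(name, *args, **kwargs):
    # Importing this module stays safe without the package; only actually
    # using a sampler raises a clear, actionable error.
    if SAMPLER_REGISTRY is None:
        raise ModuleNotFoundError("Install optional_pkg to use these samplers")
    return SAMPLER_REGISTRY[name](*args, **kwargs)
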
@@ -441,7 +441,9 @@ class GPT(nn.Module):
         audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token)
 
         # Pad mel codes with stop_audio_token
-        audio_codes = self.set_mel_padding(audio_codes, code_lengths - 3)  # -3 to get the real code lengths without consider start and stop tokens that was not added yet
+        audio_codes = self.set_mel_padding(
+            audio_codes, code_lengths - 3
+        )  # -3 to get the real code lengths without consider start and stop tokens that was not added yet
 
         # Build input and target tensors
         # Prepend start token to inputs and append stop token to targets
@@ -1,23 +1,22 @@
 import os
 import re
-import torch
-import pypinyin
 import textwrap
 from functools import cached_property
 
+import pypinyin
+import torch
 from hangul_romanize import Transliter
 from hangul_romanize.rule import academic
 from num2words import num2words
+from spacy.lang.ar import Arabic
+from spacy.lang.en import English
+from spacy.lang.es import Spanish
+from spacy.lang.ja import Japanese
+from spacy.lang.zh import Chinese
 from tokenizers import Tokenizer
 
 from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
 
-from spacy.lang.en import English
-from spacy.lang.zh import Chinese
-from spacy.lang.ja import Japanese
-from spacy.lang.ar import Arabic
-from spacy.lang.es import Spanish
-
 
 def get_spacy_lang(lang):
     if lang == "zh":
@@ -32,6 +31,7 @@ def get_spacy_lang(lang):
     # For most languages, Enlish does the job
     return English()
 
+
 def split_sentence(text, lang, text_split_length=250):
     """Preprocess the input text"""
     text_splits = []
@@ -67,6 +67,7 @@ def split_sentence(text, lang, text_split_length=250):
 
     return text_splits
 
+
 _whitespace_re = re.compile(r"\s+")
 
 # List of (regular expression, replacement) pairs for abbreviations:
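
Note: assuming these tokenizer hunks come from TTS/tts/layers/xtts/tokenizer.py, a hedged usage sketch of the split_sentence function whose surroundings are reformatted above (the input text is illustrative):

from TTS.tts.layers.xtts.tokenizer import split_sentence

# Groups sentences into chunks of roughly text_split_length characters,
# using spaCy sentence segmentation for the given language.
chunks = split_sentence("First sentence. Second sentence.", "en", text_split_length=250)
print(chunks)
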
@@ -563,9 +563,7 @@ class Xtts(BaseTTS):
 
                 if length_scale != 1.0:
                     gpt_latents = F.interpolate(
-                        gpt_latents.transpose(1, 2),
-                        scale_factor=length_scale,
-                        mode="linear"
+                        gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
                     ).transpose(1, 2)
 
                 gpt_latents_list.append(gpt_latents.cpu())
@@ -675,9 +673,7 @@ class Xtts(BaseTTS):
             gpt_latents = torch.cat(all_latents, dim=0)[None, :]
             if length_scale != 1.0:
                 gpt_latents = F.interpolate(
-                    gpt_latents.transpose(1, 2),
-                    scale_factor=length_scale,
-                    mode="linear"
+                    gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
                 ).transpose(1, 2)
             wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device))
             wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks(
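
Note: the two F.interpolate hunks above are formatting-only (the argument list is collapsed onto one line). For context, a minimal sketch of what the call does, with illustrative shapes not taken from the model: it linearly resamples the GPT latents along the time axis, so a length_scale above 1.0 yields more frames and therefore longer, slower generated speech.

import torch
import torch.nn.functional as F

gpt_latents = torch.randn(1, 10, 1024)  # (batch, time, channels); illustrative
length_scale = 1.5

# mode="linear" interpolation expects (batch, channels, length),
# hence the transposes around the call.
stretched = F.interpolate(
    gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
).transpose(1, 2)
print(stretched.shape)  # torch.Size([1, 15, 1024])
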
@@ -1,31 +1,31 @@
 # core deps
 numpy==1.22.0;python_version<="3.10"
-numpy==1.24.3;python_version>"3.10"
-cython==0.29.30
+numpy>=1.24.3;python_version>"3.10"
+cython>=0.29.30
 scipy>=1.11.2
 torch>=2.1
 torchaudio
-soundfile==0.12.*
-librosa==0.10.*
-scikit-learn==1.3.0
+soundfile>=0.12.0
+librosa>=0.10.0
+scikit-learn>=1.3.0
 numba==0.55.1;python_version<"3.9"
-numba==0.57.0;python_version>="3.9"
-inflect==5.6.*
-tqdm==4.64.*
-anyascii==0.3.*
-pyyaml==6.*
-fsspec==2023.6.0 # <= 2023.9.1 makes aux tests fail
-aiohttp==3.8.*
-packaging==23.1
+numba>=0.57.0;python_version>="3.9"
+inflect>=5.6.0
+tqdm>=4.64.1
+anyascii>=0.3.0
+pyyaml>=6.0
+fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
+aiohttp>=3.8.1
+packaging>=23.1
 # deps for examples
-flask==2.*
+flask>=2.0.1
 # deps for inference
-pysbd==0.3.4
+pysbd>=0.3.4
 # deps for notebooks
-umap-learn==0.5.*
+umap-learn>=0.5.1
 pandas>=1.4,<2.0
 # deps for training
-matplotlib==3.7.*
+matplotlib>=3.7.0
 # coqui stack
 trainer
 # config management
@@ -46,12 +46,11 @@ bangla
 bnnumerizer
 bnunicodenormalizer
 #deps for tortoise
-k_diffusion
-einops==0.6.*
-transformers==4.33.*
+einops>=0.6.0
+transformers>=4.33.0
 #deps for bark
-encodec==0.1.*
+encodec>=0.1.1
 # deps for XTTS
-unidecode==1.3.*
+unidecode>=1.3.2
 num2words
 spacy[ja]>=3
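
Note: k_diffusion is removed from requirements.txt here, matching the try/except import above: it becomes an opt-in extra rather than a hard dependency. Users who want the k_euler_a or dpm++2m samplers can install it separately (the module is distributed on PyPI, commonly under the name "k-diffusion"); everything else in these two hunks relaxes exact pins to lower bounds.
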
@@ -186,7 +186,7 @@ def test_xtts_v2_streaming():
         "en",
         gpt_cond_latent,
         speaker_embedding,
-        speed=1.5
+        speed=1.5,
     )
     wav_chuncks = []
     for i, chunk in enumerate(chunks):
@@ -198,7 +198,7 @@ def test_xtts_v2_streaming():
         "en",
         gpt_cond_latent,
         speaker_embedding,
-        speed=0.66
+        speed=0.66,
     )
     wav_chuncks = []
     for i, chunk in enumerate(chunks):