Merge pull request #3248 from coqui-ai/slacker_deps

Update versions
pull/3239/head
Eren Gölge 2023-11-17 15:13:19 +01:00 committed by GitHub
commit 14579a4607
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 55 additions and 49 deletions

View File

@ -10,7 +10,7 @@ jobs:
build-sdist: build-sdist:
runs-on: ubuntu-20.04 runs-on: ubuntu-20.04
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v3
- name: Verify tag matches version - name: Verify tag matches version
run: | run: |
set -ex set -ex
@ -38,7 +38,7 @@ jobs:
matrix: matrix:
python-version: ["3.9", "3.10", "3.11"] python-version: ["3.9", "3.10", "3.11"]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v3
- uses: actions/setup-python@v2 - uses: actions/setup-python@v2
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}

View File

@ -13,12 +13,18 @@ import math
import numpy as np import numpy as np
import torch import torch
import torch as th import torch as th
from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
from tqdm import tqdm from tqdm import tqdm
from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper
try:
from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m} K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
except ImportError:
K_DIFFUSION_SAMPLERS = None
SAMPLERS = ["dpm++2m", "p", "ddim"] SAMPLERS = ["dpm++2m", "p", "ddim"]
@ -531,6 +537,8 @@ class GaussianDiffusion:
if self.conditioning_free is not True: if self.conditioning_free is not True:
raise RuntimeError("cond_free must be true") raise RuntimeError("cond_free must be true")
with tqdm(total=self.num_timesteps) as pbar: with tqdm(total=self.num_timesteps) as pbar:
if K_DIFFUSION_SAMPLERS is None:
raise ModuleNotFoundError("Install k_diffusion for using k_diffusion samplers")
return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs) return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs)
else: else:
raise RuntimeError("sampler not impl") raise RuntimeError("sampler not impl")

View File

@ -441,7 +441,9 @@ class GPT(nn.Module):
audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token) audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token)
# Pad mel codes with stop_audio_token # Pad mel codes with stop_audio_token
audio_codes = self.set_mel_padding(audio_codes, code_lengths - 3) # -3 to get the real code lengths, not counting the start and stop tokens that have not been added yet audio_codes = self.set_mel_padding(
audio_codes, code_lengths - 3
) # -3 to get the real code lengths, not counting the start and stop tokens that have not been added yet
# Build input and target tensors # Build input and target tensors
# Prepend start token to inputs and append stop token to targets # Prepend start token to inputs and append stop token to targets

View File

@ -1,23 +1,22 @@
import os import os
import re import re
import torch
import pypinyin
import textwrap import textwrap
from functools import cached_property from functools import cached_property
import pypinyin
import torch
from hangul_romanize import Transliter from hangul_romanize import Transliter
from hangul_romanize.rule import academic from hangul_romanize.rule import academic
from num2words import num2words from num2words import num2words
from spacy.lang.ar import Arabic
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.ja import Japanese
from spacy.lang.zh import Chinese
from tokenizers import Tokenizer from tokenizers import Tokenizer
from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
from spacy.lang.en import English
from spacy.lang.zh import Chinese
from spacy.lang.ja import Japanese
from spacy.lang.ar import Arabic
from spacy.lang.es import Spanish
def get_spacy_lang(lang): def get_spacy_lang(lang):
if lang == "zh": if lang == "zh":
@ -32,6 +31,7 @@ def get_spacy_lang(lang):
# For most languages, English does the job # For most languages, English does the job
return English() return English()
def split_sentence(text, lang, text_split_length=250): def split_sentence(text, lang, text_split_length=250):
"""Preprocess the input text""" """Preprocess the input text"""
text_splits = [] text_splits = []
@ -67,6 +67,7 @@ def split_sentence(text, lang, text_split_length=250):
return text_splits return text_splits
_whitespace_re = re.compile(r"\s+") _whitespace_re = re.compile(r"\s+")
# List of (regular expression, replacement) pairs for abbreviations: # List of (regular expression, replacement) pairs for abbreviations:

View File

@ -563,9 +563,7 @@ class Xtts(BaseTTS):
if length_scale != 1.0: if length_scale != 1.0:
gpt_latents = F.interpolate( gpt_latents = F.interpolate(
gpt_latents.transpose(1, 2), gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
scale_factor=length_scale,
mode="linear"
).transpose(1, 2) ).transpose(1, 2)
gpt_latents_list.append(gpt_latents.cpu()) gpt_latents_list.append(gpt_latents.cpu())
@ -675,9 +673,7 @@ class Xtts(BaseTTS):
gpt_latents = torch.cat(all_latents, dim=0)[None, :] gpt_latents = torch.cat(all_latents, dim=0)[None, :]
if length_scale != 1.0: if length_scale != 1.0:
gpt_latents = F.interpolate( gpt_latents = F.interpolate(
gpt_latents.transpose(1, 2), gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
scale_factor=length_scale,
mode="linear"
).transpose(1, 2) ).transpose(1, 2)
wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device)) wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device))
wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks( wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks(

View File

@ -1,31 +1,31 @@
# core deps # core deps
numpy==1.22.0;python_version<="3.10" numpy==1.22.0;python_version<="3.10"
numpy==1.24.3;python_version>"3.10" numpy>=1.24.3;python_version>"3.10"
cython==0.29.30 cython>=0.29.30
scipy>=1.11.2 scipy>=1.11.2
torch>=2.1 torch>=2.1
torchaudio torchaudio
soundfile==0.12.* soundfile>=0.12.0
librosa==0.10.* librosa>=0.10.0
scikit-learn==1.3.0 scikit-learn>=1.3.0
numba==0.55.1;python_version<"3.9" numba==0.55.1;python_version<"3.9"
numba==0.57.0;python_version>="3.9" numba>=0.57.0;python_version>="3.9"
inflect==5.6.* inflect>=5.6.0
tqdm==4.64.* tqdm>=4.64.1
anyascii==0.3.* anyascii>=0.3.0
pyyaml==6.* pyyaml>=6.0
fsspec==2023.6.0 # <= 2023.9.1 makes aux tests fail fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
aiohttp==3.8.* aiohttp>=3.8.1
packaging==23.1 packaging>=23.1
# deps for examples # deps for examples
flask==2.* flask>=2.0.1
# deps for inference # deps for inference
pysbd==0.3.4 pysbd>=0.3.4
# deps for notebooks # deps for notebooks
umap-learn==0.5.* umap-learn>=0.5.1
pandas>=1.4,<2.0 pandas>=1.4,<2.0
# deps for training # deps for training
matplotlib==3.7.* matplotlib>=3.7.0
# coqui stack # coqui stack
trainer trainer
# config management # config management
@ -46,12 +46,11 @@ bangla
bnnumerizer bnnumerizer
bnunicodenormalizer bnunicodenormalizer
#deps for tortoise #deps for tortoise
k_diffusion einops>=0.6.0
einops==0.6.* transformers>=4.33.0
transformers==4.33.*
#deps for bark #deps for bark
encodec==0.1.* encodec>=0.1.1
# deps for XTTS # deps for XTTS
unidecode==1.3.* unidecode>=1.3.2
num2words num2words
spacy[ja]>=3 spacy[ja]>=3

View File

@ -186,7 +186,7 @@ def test_xtts_v2_streaming():
"en", "en",
gpt_cond_latent, gpt_cond_latent,
speaker_embedding, speaker_embedding,
speed=1.5 speed=1.5,
) )
wav_chuncks = [] wav_chuncks = []
for i, chunk in enumerate(chunks): for i, chunk in enumerate(chunks):
@ -198,7 +198,7 @@ def test_xtts_v2_streaming():
"en", "en",
gpt_cond_latent, gpt_cond_latent,
speaker_embedding, speaker_embedding,
speed=0.66 speed=0.66,
) )
wav_chuncks = [] wav_chuncks = []
for i, chunk in enumerate(chunks): for i, chunk in enumerate(chunks):