Mirror of https://github.com/coqui-ai/TTS.git, commit 14579a4607.
--- a/.github/workflows/pypi-release.yml
+++ b/.github/workflows/pypi-release.yml
@@ -10,7 +10,7 @@ jobs:
   build-sdist:
     runs-on: ubuntu-20.04
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Verify tag matches version
         run: |
           set -ex
@@ -38,7 +38,7 @@ jobs:
       matrix:
         python-version: ["3.9", "3.10", "3.11"]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
--- a/TTS/tts/layers/tortoise/diffusion.py
+++ b/TTS/tts/layers/tortoise/diffusion.py
@@ -13,12 +13,18 @@ import math
 import numpy as np
 import torch
 import torch as th
-from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
 from tqdm import tqdm
 
 from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper
 
-K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
+try:
+    from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
+
+    K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
+except ImportError:
+    K_DIFFUSION_SAMPLERS = None
+
 
 SAMPLERS = ["dpm++2m", "p", "ddim"]
@@ -531,6 +537,8 @@ class GaussianDiffusion:
             if self.conditioning_free is not True:
                 raise RuntimeError("cond_free must be true")
             with tqdm(total=self.num_timesteps) as pbar:
+                if K_DIFFUSION_SAMPLERS is None:
+                    raise ModuleNotFoundError("Install k_diffusion for using k_diffusion samplers")
                 return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs)
         else:
             raise RuntimeError("sampler not impl")
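Note: the two diffusion.py hunks above work together to make k_diffusion an
optional dependency: a failed import is swallowed at module load, and the
error surfaces only when a k-diffusion sampler is actually requested. A
minimal sketch of the pattern, with a hypothetical fancy_sampler module
standing in for k_diffusion:

    # Optional-dependency pattern (fancy_sampler is a made-up module name).
    try:
        from fancy_sampler import sample  # optional extra; may be absent

        SAMPLERS = {"fancy": sample}
    except ImportError:
        SAMPLERS = None  # importing this module still succeeds

    def run_sampler(name, *args):
        # Defer the failure from import time to call time, with a clear message.
        if SAMPLERS is None:
            raise ModuleNotFoundError("Install fancy_sampler to use these samplers")
        return SAMPLERS[name](*args)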
--- a/TTS/tts/layers/xtts/gpt.py
+++ b/TTS/tts/layers/xtts/gpt.py
@@ -441,7 +441,9 @@ class GPT(nn.Module):
         audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token)
 
         # Pad mel codes with stop_audio_token
-        audio_codes = self.set_mel_padding(audio_codes, code_lengths - 3)  # -3 to get the real code lengths without consider start and stop tokens that was not added yet
+        audio_codes = self.set_mel_padding(
+            audio_codes, code_lengths - 3
+        )  # -3 to get the real code lengths without consider start and stop tokens that was not added yet
 
         # Build input and target tensors
         # Prepend start token to inputs and append stop token to targets
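Note: the gpt.py hunk above is a pure line-length reflow; behavior is
unchanged. For context, the F.pad call in the surrounding code appends one
stop-token column to the code tensor. A standalone sketch (the token id and
tensor shape are assumptions, not the model's real configuration):

    import torch
    import torch.nn.functional as F

    stop_audio_token = 1025                        # assumed token id
    audio_codes = torch.randint(0, 1024, (2, 10))  # (batch, n_codes), assumed shape

    # Pad spec (0, 1) acts on the last dim: 0 columns on the left, 1 on the right.
    audio_codes = F.pad(audio_codes, (0, 1), value=stop_audio_token)
    print(audio_codes.shape)  # torch.Size([2, 11])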
--- a/TTS/tts/layers/xtts/tokenizer.py
+++ b/TTS/tts/layers/xtts/tokenizer.py
@@ -1,23 +1,22 @@
 import os
 import re
-import torch
-import pypinyin
 import textwrap
-
 from functools import cached_property
 
+import pypinyin
+import torch
 from hangul_romanize import Transliter
 from hangul_romanize.rule import academic
 from num2words import num2words
+from spacy.lang.ar import Arabic
+from spacy.lang.en import English
+from spacy.lang.es import Spanish
+from spacy.lang.ja import Japanese
+from spacy.lang.zh import Chinese
 from tokenizers import Tokenizer
 
 from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
 
-from spacy.lang.en import English
-from spacy.lang.zh import Chinese
-from spacy.lang.ja import Japanese
-from spacy.lang.ar import Arabic
-from spacy.lang.es import Spanish
 
 
 def get_spacy_lang(lang):
     if lang == "zh":
@@ -32,6 +31,7 @@ def get_spacy_lang(lang):
         # For most languages, Enlish does the job
         return English()
 
+
 def split_sentence(text, lang, text_split_length=250):
     """Preprocess the input text"""
     text_splits = []
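Note: get_spacy_lang returns a blank spaCy pipeline, which split_sentence can
use for rule-based sentence segmentation. A sketch of that usage, assuming
spaCy's stock "sentencizer" component:

    from spacy.lang.en import English

    nlp = English()              # blank pipeline; no model download needed
    nlp.add_pipe("sentencizer")  # rule-based sentence boundary detection
    doc = nlp("First sentence. Second one!")
    print([s.text for s in doc.sents])  # ['First sentence.', 'Second one!']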
@@ -67,6 +67,7 @@ def split_sentence(text, lang, text_split_length=250):
 
     return text_splits
 
+
 _whitespace_re = re.compile(r"\s+")
 
 # List of (regular expression, replacement) pairs for abbreviations:
@@ -619,7 +620,7 @@ class VoiceBpeTokenizer:
         return cutlet.Cutlet()
 
     def check_input_length(self, txt, lang):
-        lang = lang.split("-")[0] # remove the region
+        lang = lang.split("-")[0]  # remove the region
         limit = self.char_limits.get(lang, 250)
         if len(txt) > limit:
             print(
@@ -640,7 +641,7 @@ class VoiceBpeTokenizer:
         return txt
 
     def encode(self, txt, lang):
-        lang = lang.split("-")[0] # remove the region
+        lang = lang.split("-")[0]  # remove the region
         self.check_input_length(txt, lang)
         txt = self.preprocess_text(txt, lang)
         lang = "zh-cn" if lang == "zh" else lang
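Note: both tokenizer hunks fix comment spacing around the same idiom: a
region-qualified tag such as "zh-cn" or "en-US" is reduced to its base
language before the per-language character limit is looked up. A sketch with
illustrative limits (not the tokenizer's real table):

    lang = "zh-cn".split("-")[0]                  # -> "zh"
    limit = {"en": 250, "zh": 82}.get(lang, 250)  # assumed limits; 250 default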
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@@ -513,13 +513,13 @@ class Xtts(BaseTTS):
         enable_text_splitting=False,
         **hf_generate_kwargs,
     ):
-        language = language.split("-")[0] # remove the country code
+        language = language.split("-")[0]  # remove the country code
         length_scale = 1.0 / max(speed, 0.05)
         if enable_text_splitting:
             text = split_sentence(text, language, self.tokenizer.char_limits[language])
         else:
             text = [text]
 
         wavs = []
         gpt_latents_list = []
         for sent in text:
@@ -563,9 +563,7 @@ class Xtts(BaseTTS):
 
             if length_scale != 1.0:
                 gpt_latents = F.interpolate(
-                    gpt_latents.transpose(1, 2),
-                    scale_factor=length_scale,
-                    mode="linear"
+                    gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
                 ).transpose(1, 2)
 
             gpt_latents_list.append(gpt_latents.cpu())
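Note: the collapsed F.interpolate call is how speed control works here: speed
is inverted into length_scale, and the GPT latents are linearly resampled
along the time axis, so speed=1.5 yields roughly two-thirds as many frames.
A runnable sketch (the latent shape is an assumption):

    import torch
    import torch.nn.functional as F

    speed = 1.5
    length_scale = 1.0 / max(speed, 0.05)    # clamp guards against speed near 0
    gpt_latents = torch.randn(1, 100, 1024)  # (batch, time, channels), assumed

    stretched = F.interpolate(
        gpt_latents.transpose(1, 2),  # F.interpolate expects (batch, channels, time)
        scale_factor=length_scale,
        mode="linear",
    ).transpose(1, 2)
    print(stretched.shape)  # torch.Size([1, 66, 1024])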
@@ -623,7 +621,7 @@ class Xtts(BaseTTS):
         enable_text_splitting=False,
         **hf_generate_kwargs,
     ):
-        language = language.split("-")[0] # remove the country code
+        language = language.split("-")[0]  # remove the country code
         length_scale = 1.0 / max(speed, 0.05)
         if enable_text_splitting:
             text = split_sentence(text, language, self.tokenizer.char_limits[language])
@@ -675,9 +673,7 @@ class Xtts(BaseTTS):
             gpt_latents = torch.cat(all_latents, dim=0)[None, :]
             if length_scale != 1.0:
                 gpt_latents = F.interpolate(
-                    gpt_latents.transpose(1, 2),
-                    scale_factor=length_scale,
-                    mode="linear"
+                    gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
                 ).transpose(1, 2)
             wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device))
             wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks(
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,31 +1,31 @@
 # core deps
 numpy==1.22.0;python_version<="3.10"
-numpy==1.24.3;python_version>"3.10"
-cython==0.29.30
+numpy>=1.24.3;python_version>"3.10"
+cython>=0.29.30
 scipy>=1.11.2
 torch>=2.1
 torchaudio
-soundfile==0.12.*
-librosa==0.10.*
-scikit-learn==1.3.0
+soundfile>=0.12.0
+librosa>=0.10.0
+scikit-learn>=1.3.0
 numba==0.55.1;python_version<"3.9"
-numba==0.57.0;python_version>="3.9"
-inflect==5.6.*
-tqdm==4.64.*
-anyascii==0.3.*
-pyyaml==6.*
-fsspec==2023.6.0 # <= 2023.9.1 makes aux tests fail
-aiohttp==3.8.*
-packaging==23.1
+numba>=0.57.0;python_version>="3.9"
+inflect>=5.6.0
+tqdm>=4.64.1
+anyascii>=0.3.0
+pyyaml>=6.0
+fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
+aiohttp>=3.8.1
+packaging>=23.1
 # deps for examples
-flask==2.*
+flask>=2.0.1
 # deps for inference
-pysbd==0.3.4
+pysbd>=0.3.4
 # deps for notebooks
-umap-learn==0.5.*
+umap-learn>=0.5.1
 pandas>=1.4,<2.0
 # deps for training
-matplotlib==3.7.*
+matplotlib>=3.7.0
 # coqui stack
 trainer
 # config management
@@ -46,12 +46,11 @@ bangla
 bnnumerizer
 bnunicodenormalizer
 #deps for tortoise
-k_diffusion
-einops==0.6.*
-transformers==4.33.*
+einops>=0.6.0
+transformers>=4.33.0
 #deps for bark
-encodec==0.1.*
+encodec>=0.1.1
 # deps for XTTS
-unidecode==1.3.*
+unidecode>=1.3.2
 num2words
 spacy[ja]>=3
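Note: the common thread in both requirements.txt hunks is replacing exact and
wildcard pins (==x.y.z, ==x.y.*) with minimum-version floors (>=), and
dropping k_diffusion, which the diffusion.py change above turned into an
optional import. The practical difference between the two operators can be
checked with the packaging library (the PEP 440 implementation that pip
vendors); a sketch:

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    print(Version("3.0.7") in SpecifierSet(">=0.29.30"))  # True  (floor only)
    print(Version("3.0.7") in SpecifierSet("==0.29.30"))  # False (exact pin)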
--- a/tests/zoo_tests/test_models.py
+++ b/tests/zoo_tests/test_models.py
@@ -186,7 +186,7 @@ def test_xtts_v2_streaming():
         "en",
         gpt_cond_latent,
         speaker_embedding,
-        speed=1.5
+        speed=1.5,
     )
     wav_chuncks = []
     for i, chunk in enumerate(chunks):
@@ -198,7 +198,7 @@ def test_xtts_v2_streaming():
         "en",
         gpt_cond_latent,
         speaker_embedding,
-        speed=0.66
+        speed=0.66,
     )
     wav_chuncks = []
     for i, chunk in enumerate(chunks):