mirror of https://github.com/coqui-ai/TTS.git
commit 14579a4607
@@ -10,7 +10,7 @@ jobs:
   build-sdist:
     runs-on: ubuntu-20.04
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Verify tag matches version
        run: |
          set -ex
@@ -38,7 +38,7 @@ jobs:
       matrix:
         python-version: ["3.9", "3.10", "3.11"]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
@@ -13,12 +13,18 @@ import math
 import numpy as np
 import torch
 import torch as th
-from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
 from tqdm import tqdm
 
 from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper
 
-K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
+try:
+    from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
+
+    K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
+except ImportError:
+    K_DIFFUSION_SAMPLERS = None
+
+
 SAMPLERS = ["dpm++2m", "p", "ddim"]
 
 
@@ -531,6 +537,8 @@ class GaussianDiffusion:
             if self.conditioning_free is not True:
                 raise RuntimeError("cond_free must be true")
             with tqdm(total=self.num_timesteps) as pbar:
+                if K_DIFFUSION_SAMPLERS is None:
+                    raise ModuleNotFoundError("Install k_diffusion for using k_diffusion samplers")
                 return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs)
         else:
             raise RuntimeError("sampler not impl")
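
Note: the two hunks above make k_diffusion an optional dependency: the import is wrapped in try/except at module load, and the K_DIFFUSION_SAMPLERS sentinel is re-checked at call time. A minimal self-contained sketch of the same pattern, using a hypothetical package name optional_pkg in place of the real k_diffusion:

try:
    from optional_pkg import fancy_sampler  # hypothetical optional dependency

    SAMPLER_REGISTRY = {"fancy": fancy_sampler}
except ImportError:
    SAMPLER_REGISTRY = None  # package missing; defer the failure to call time


def sample(name, *args, **kwargs):
    # Importing this module stays safe without the package; only actually
    # using a sampler raises a clear, actionable error.
    if SAMPLER_REGISTRY is None:
        raise ModuleNotFoundError("Install optional_pkg to use these samplers")
    return SAMPLER_REGISTRY[name](*args, **kwargs)
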
@@ -441,7 +441,9 @@ class GPT(nn.Module):
         audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token)
 
         # Pad mel codes with stop_audio_token
-        audio_codes = self.set_mel_padding(audio_codes, code_lengths - 3)  # -3 to get the real code lengths without consider start and stop tokens that was not added yet
+        audio_codes = self.set_mel_padding(
+            audio_codes, code_lengths - 3
+        )  # -3 to get the real code lengths without consider start and stop tokens that was not added yet
 
         # Build input and target tensors
         # Prepend start token to inputs and append stop token to targets
@@ -1,23 +1,22 @@
 import os
 import re
-import torch
-import pypinyin
 import textwrap
 from functools import cached_property
 
+import pypinyin
+import torch
 from hangul_romanize import Transliter
 from hangul_romanize.rule import academic
 from num2words import num2words
+from spacy.lang.ar import Arabic
+from spacy.lang.en import English
+from spacy.lang.es import Spanish
+from spacy.lang.ja import Japanese
+from spacy.lang.zh import Chinese
 from tokenizers import Tokenizer
 
 from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
 
-from spacy.lang.en import English
-from spacy.lang.zh import Chinese
-from spacy.lang.ja import Japanese
-from spacy.lang.ar import Arabic
-from spacy.lang.es import Spanish
-
 
 def get_spacy_lang(lang):
     if lang == "zh":
@@ -32,6 +31,7 @@ def get_spacy_lang(lang):
     # For most languages, Enlish does the job
     return English()
 
+
 def split_sentence(text, lang, text_split_length=250):
     """Preprocess the input text"""
     text_splits = []
@@ -67,6 +67,7 @@ def split_sentence(text, lang, text_split_length=250):
 
     return text_splits
 
+
 _whitespace_re = re.compile(r"\s+")
 
 # List of (regular expression, replacement) pairs for abbreviations:
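
Note: assuming these tokenizer hunks come from TTS/tts/layers/xtts/tokenizer.py, a hedged usage sketch of the split_sentence function whose surroundings are reformatted above (the input text is illustrative):

from TTS.tts.layers.xtts.tokenizer import split_sentence

# Groups sentences into chunks of roughly text_split_length characters,
# using spaCy sentence segmentation for the given language.
chunks = split_sentence("First sentence. Second sentence.", "en", text_split_length=250)
print(chunks)
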
@@ -563,9 +563,7 @@ class Xtts(BaseTTS):
 
                 if length_scale != 1.0:
                     gpt_latents = F.interpolate(
-                        gpt_latents.transpose(1, 2),
-                        scale_factor=length_scale,
-                        mode="linear"
+                        gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
                     ).transpose(1, 2)
 
                 gpt_latents_list.append(gpt_latents.cpu())
@@ -675,9 +673,7 @@ class Xtts(BaseTTS):
             gpt_latents = torch.cat(all_latents, dim=0)[None, :]
             if length_scale != 1.0:
                 gpt_latents = F.interpolate(
-                    gpt_latents.transpose(1, 2),
-                    scale_factor=length_scale,
-                    mode="linear"
+                    gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
                 ).transpose(1, 2)
             wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device))
             wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks(
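
Note: the two F.interpolate hunks above are formatting-only (the argument list is collapsed onto one line). For context, a minimal sketch of what the call does, with illustrative shapes not taken from the model: it linearly resamples the GPT latents along the time axis, so a length_scale above 1.0 yields more frames and therefore longer, slower generated speech.

import torch
import torch.nn.functional as F

gpt_latents = torch.randn(1, 10, 1024)  # (batch, time, channels); illustrative
length_scale = 1.5

# mode="linear" interpolation expects (batch, channels, length),
# hence the transposes around the call.
stretched = F.interpolate(
    gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
).transpose(1, 2)
print(stretched.shape)  # torch.Size([1, 15, 1024])
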
@@ -1,31 +1,31 @@
 # core deps
 numpy==1.22.0;python_version<="3.10"
-numpy==1.24.3;python_version>"3.10"
-cython==0.29.30
+numpy>=1.24.3;python_version>"3.10"
+cython>=0.29.30
 scipy>=1.11.2
 torch>=2.1
 torchaudio
-soundfile==0.12.*
-librosa==0.10.*
-scikit-learn==1.3.0
+soundfile>=0.12.0
+librosa>=0.10.0
+scikit-learn>=1.3.0
 numba==0.55.1;python_version<"3.9"
-numba==0.57.0;python_version>="3.9"
-inflect==5.6.*
-tqdm==4.64.*
-anyascii==0.3.*
-pyyaml==6.*
-fsspec==2023.6.0 # <= 2023.9.1 makes aux tests fail
-aiohttp==3.8.*
-packaging==23.1
+numba>=0.57.0;python_version>="3.9"
+inflect>=5.6.0
+tqdm>=4.64.1
+anyascii>=0.3.0
+pyyaml>=6.0
+fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
+aiohttp>=3.8.1
+packaging>=23.1
 # deps for examples
-flask==2.*
+flask>=2.0.1
 # deps for inference
-pysbd==0.3.4
+pysbd>=0.3.4
 # deps for notebooks
-umap-learn==0.5.*
+umap-learn>=0.5.1
 pandas>=1.4,<2.0
 # deps for training
-matplotlib==3.7.*
+matplotlib>=3.7.0
 # coqui stack
 trainer
 # config management
@@ -46,12 +46,11 @@ bangla
 bnnumerizer
 bnunicodenormalizer
 #deps for tortoise
-k_diffusion
-einops==0.6.*
-transformers==4.33.*
+einops>=0.6.0
+transformers>=4.33.0
 #deps for bark
-encodec==0.1.*
+encodec>=0.1.1
 # deps for XTTS
-unidecode==1.3.*
+unidecode>=1.3.2
 num2words
 spacy[ja]>=3
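
Note: k_diffusion is removed from requirements.txt here, matching the try/except import above: it becomes an opt-in extra rather than a hard dependency. Users who want the k_euler_a or dpm++2m samplers can install it separately (the module is distributed on PyPI, commonly under the name "k-diffusion"); everything else in these two hunks relaxes exact pins to lower bounds.
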
@@ -186,7 +186,7 @@ def test_xtts_v2_streaming():
         "en",
         gpt_cond_latent,
         speaker_embedding,
-        speed=1.5
+        speed=1.5,
     )
     wav_chuncks = []
     for i, chunk in enumerate(chunks):
@@ -198,7 +198,7 @@ def test_xtts_v2_streaming():
         "en",
         gpt_cond_latent,
         speaker_embedding,
-        speed=0.66
+        speed=0.66,
     )
     wav_chuncks = []
     for i, chunk in enumerate(chunks):