Merge branch 'coqui-ai:dev' into dev

pull/2735/head
Frederico S. Oliveira 2023-11-30 14:19:05 -03:00 committed by GitHub
commit a26e51b0b4
9 changed files with 183 additions and 31 deletions

View File

@ -72,7 +72,7 @@ Please use our dedicated channels for questions and discussion. Help is much mor
| Type | Links |
| ------------------------------- | --------------------------------------- |
| 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/)|
| 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#install-tts)|
| 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#installation)|
| 👩‍💻 **Contributing** | [CONTRIBUTING.md](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md)|
| 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378)|
| 🚀 **Released Models** | [TTS Releases](https://github.com/coqui-ai/TTS/releases) and [Experimental Models](https://github.com/coqui-ai/TTS/wiki/Experimental-Released-Models)|

View File

@ -1 +1 @@
0.21.1
0.21.2

View File

@ -12,6 +12,7 @@ from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
from TTS.config import load_config
class TTS(nn.Module):
"""TODO: Add voice conversion and Capacitron support."""
@ -75,11 +76,13 @@ class TTS(nn.Module):
if gpu:
warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")
if model_name is not None:
if model_name is not None and len(model_name) > 0:
if "tts_models" in model_name or "coqui_studio" in model_name:
self.load_tts_model_by_name(model_name, gpu)
elif "voice_conversion_models" in model_name:
self.load_vc_model_by_name(model_name, gpu)
else:
self.load_model_by_name(model_name, gpu)
if model_path:
self.load_tts_model_by_path(
@ -105,8 +108,12 @@ class TTS(nn.Module):
@property
def is_multi_lingual(self):
# Not sure what sets this to None, but applied a fix to prevent crashing.
if (isinstance(self.model_name, str) and "xtts" in self.model_name or
self.config and ("xtts" in self.config.model or len(self.config.languages) > 1)):
if (
isinstance(self.model_name, str)
and "xtts" in self.model_name
or self.config
and ("xtts" in self.config.model or len(self.config.languages) > 1)
):
return True
if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
return self.synthesizer.tts_model.language_manager.num_languages > 1
@ -149,6 +156,15 @@ class TTS(nn.Module):
vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
return model_path, config_path, vocoder_path, vocoder_config_path, None
def load_model_by_name(self, model_name: str, gpu: bool = False):
"""Load one of the 🐸TTS models by name.
Args:
model_name (str): Model name to load. You can list models by ```tts.models```.
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
self.load_tts_model_by_name(model_name, gpu)
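# Usage sketch (illustrative): short names without the "tts_models/" prefix are
# resolved through this method, e.g.
#     TTS("xtts")          # latest XTTS release
#     TTS("xtts_v2.0.2")   # a specific XTTS version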
def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
"""Load one of the voice conversion models by name.
@ -310,6 +326,7 @@ class TTS(nn.Module):
speaker_wav: str = None,
emotion: str = None,
speed: float = None,
split_sentences: bool = True,
**kwargs,
):
"""Convert text to speech.
@ -330,6 +347,12 @@ class TTS(nn.Module):
speed (float, optional):
Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
Defaults to None.
split_sentences (bool, optional):
Split the text into sentences, synthesize them separately, and concatenate the resulting audio.
Setting it to False uses more VRAM and may hit model-specific text-length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
kwargs (dict, optional):
Additional arguments for the model.
"""
self._check_arguments(
speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
@ -347,6 +370,7 @@ class TTS(nn.Module):
style_wav=None,
style_text=None,
reference_speaker_name=None,
split_sentences=split_sentences,
**kwargs,
)
return wav
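# Usage sketch (illustrative; paths are placeholders):
#     tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
#     wav = tts.tts(
#         text="A long paragraph of text ...",
#         speaker_wav="/path/to/target/speaker.wav",
#         language="en",
#         split_sentences=False,  # single pass: more VRAM, may hit text-length limits
#     )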
@ -361,6 +385,7 @@ class TTS(nn.Module):
speed: float = 1.0,
pipe_out=None,
file_path: str = "output.wav",
split_sentences: bool = True,
**kwargs,
):
"""Convert text to speech.
@ -385,6 +410,10 @@ class TTS(nn.Module):
Flag to stdout the generated TTS wav file for shell pipe.
file_path (str, optional):
Output file path. Defaults to "output.wav".
split_sentences (bool, optional):
Split the text into sentences, synthesize them separately, and concatenate the resulting audio.
Setting it to False uses more VRAM and may hit model-specific text-length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
kwargs (dict, optional):
Additional arguments for the model.
"""
@ -400,7 +429,14 @@ class TTS(nn.Module):
file_path=file_path,
pipe_out=pipe_out,
)
wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
wav = self.tts(
text=text,
speaker=speaker,
language=language,
speaker_wav=speaker_wav,
split_sentences=split_sentences,
**kwargs,
)
self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
return file_path
@ -440,7 +476,14 @@ class TTS(nn.Module):
save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
return file_path
def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None, speaker: str = None):
def tts_with_vc(
self,
text: str,
language: str = None,
speaker_wav: str = None,
speaker: str = None,
split_sentences: bool = True,
):
"""Convert text to speech with voice conversion.
It combines tts with voice conversion to fake voice cloning.
@ -460,10 +503,16 @@ class TTS(nn.Module):
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
split_sentences (bool, optional):
Split the text into sentences, synthesize them separately, and concatenate the resulting audio.
Setting it to False uses more VRAM and may hit model-specific text-length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
"""
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
# Lazy code... save it to a temp file to resample it while reading it for VC
self.tts_to_file(text=text, speaker=speaker, language=language, file_path=fp.name)
self.tts_to_file(
text=text, speaker=speaker, language=language, file_path=fp.name, split_sentences=split_sentences
)
if self.voice_converter is None:
self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
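# Usage sketch (illustrative; clones the voice in `speaker_wav` on top of the TTS output):
#     wav = tts.tts_with_vc(
#         text="Hello world.",
#         speaker_wav="/path/to/target/speaker.wav",
#         split_sentences=True,
#     )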
@ -476,6 +525,7 @@ class TTS(nn.Module):
speaker_wav: str = None,
file_path: str = "output.wav",
speaker: str = None,
split_sentences: bool = True,
):
"""Convert text to speech with voice conversion and save to file.
@ -495,6 +545,12 @@ class TTS(nn.Module):
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
split_sentences (bool, optional):
Split the text into sentences, synthesize them separately, and concatenate the resulting audio.
Setting it to False uses more VRAM and may hit model-specific text-length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
"""
wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav, speaker=speaker)
wav = self.tts_with_vc(
text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences
)
save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)

View File

@ -319,7 +319,7 @@ class GPTTrainer(BaseTTS):
return self.train_step(batch, criterion)
def on_train_epoch_start(self, trainer):
trainer.model.eval() # the whole model to eval
trainer.model.eval() # the whole model to eval
# put gpt model in training mode
trainer.model.xtts.gpt.train()

View File

@ -15,7 +15,6 @@ class PuncPosition(Enum):
BEGIN = 0
END = 1
MIDDLE = 2
ALONE = 3
class Punctuation:
@ -92,7 +91,7 @@ class Punctuation:
return [text], []
# the text is only punctuations
if len(matches) == 1 and matches[0].group() == text:
return [], [_PUNC_IDX(text, PuncPosition.ALONE)]
return [], [_PUNC_IDX(text, PuncPosition.BEGIN)]
# build a punctuation map to be used later to restore punctuations
puncs = []
for match in matches:
@ -107,11 +106,14 @@ class Punctuation:
for idx, punc in enumerate(puncs):
split = text.split(punc.punc)
prefix, suffix = split[0], punc.punc.join(split[1:])
text = suffix
if prefix == "":
# We don't want to insert an empty string in case of initial punctuation
continue
splitted_text.append(prefix)
# if the text does not end with a punctuation, add it to the last item
if idx == len(puncs) - 1 and len(suffix) > 0:
splitted_text.append(suffix)
text = suffix
return splitted_text, puncs
@classmethod
@ -127,10 +129,10 @@ class Punctuation:
['This is', 'example'], ['.', '!'] -> "This is. example!"
"""
return cls._restore(text, puncs, 0)
return cls._restore(text, puncs)
@classmethod
def _restore(cls, text, puncs, num): # pylint: disable=too-many-return-statements
def _restore(cls, text, puncs): # pylint: disable=too-many-return-statements
"""Auxiliary method for Punctuation.restore()"""
if not puncs:
return text
@ -142,21 +144,18 @@ class Punctuation:
current = puncs[0]
if current.position == PuncPosition.BEGIN:
return cls._restore([current.punc + text[0]] + text[1:], puncs[1:], num)
return cls._restore([current.punc + text[0]] + text[1:], puncs[1:])
if current.position == PuncPosition.END:
return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1)
if current.position == PuncPosition.ALONE:
return [current.mark] + cls._restore(text, puncs[1:], num + 1)
return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:])
# POSITION == MIDDLE
if len(text) == 1: # pragma: nocover
# a corner case where the final part of an intermediate
# mark (I) has not been phonemized
return cls._restore([text[0] + current.punc], puncs[1:], num)
return cls._restore([text[0] + current.punc], puncs[1:])
return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num)
return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:])
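# Usage sketch (illustrative):
#     p = Punctuation()
#     parts, puncs = p.strip_to_restore("!!! Attention !!! This is just a ... test.")
#     restored = Punctuation.restore(parts, puncs)  # re-attaches the stripped punctuation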
# if __name__ == "__main__":

View File

@ -1,5 +1,6 @@
import json
import os
import re
import tarfile
import zipfile
from pathlib import Path
@ -26,7 +27,6 @@ LICENSE_URLS = {
}
class ModelManager(object):
tqdm_progress = None
"""Manage TTS models defined in .models.json.
@ -276,13 +276,15 @@ class ModelManager(object):
model_item["model_url"] = model_item["hf_url"]
elif "fairseq" in model_item["model_name"]:
model_item["model_url"] = "https://coqui.gateway.scarf.sh/fairseq/"
elif "xtts" in model_item["model_name"]:
model_item["model_url"] = "https://coqui.gateway.scarf.sh/xtts/"
return model_item
def _set_model_item(self, model_name):
# fetch model info from the dict
model_type, lang, dataset, model = model_name.split("/")
model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
if "fairseq" in model_name:
model_type = "tts_models"
lang = model_name.split("/")[1]
model_item = {
"model_type": "tts_models",
"license": "CC BY-NC 4.0",
@ -291,10 +293,37 @@ class ModelManager(object):
"description": "this model is released by Meta under Fairseq repo. Visit https://github.com/facebookresearch/fairseq/tree/main/examples/mms for more info.",
}
model_item["model_name"] = model_name
elif "xtts" in model_name and len(model_name.split("/")) != 4:
# loading xtts models specified only by model name (e.g. xtts_v2.0.2)
# check whether the model name includes a version number
version_regex = r"v\d+\.\d+\.\d+"
if re.search(version_regex, model_name):
model_version = model_name.split("_")[-1]
else:
model_version = "main"
model_type = "tts_models"
lang = "multilingual"
dataset = "multi-dataset"
model = model_name
model_item = {
"default_vocoder": None,
"license": "CPML",
"contact": "info@coqui.ai",
"tos_required": True,
"hf_url": [
f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/model.pth",
f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/config.json",
f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/vocab.json",
f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/hash.md5",
],
}
else:
# get model from models.json
model_type, lang, dataset, model = model_name.split("/")
model_item = self.models_dict[model_type][lang][dataset][model]
model_item["model_type"] = model_type
model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
md5hash = model_item["model_hash"] if "model_hash" in model_item else None
model_item = self.set_model_url(model_item)
return model_item, model_full_name, model, md5hash
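# Resolution sketch (illustrative), following the branches above:
#     "tts_models/multilingual/multi-dataset/xtts_v2" -> looked up in .models.json
#     "xtts_v2.0.2"                                   -> XTTS-v2 files from the "v2.0.2" branch of the HF mirror
#     "xtts"                                          -> no version in the name, so "main" is used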

View File

@ -264,6 +264,7 @@ class Synthesizer(nn.Module):
style_text=None,
reference_wav=None,
reference_speaker_name=None,
split_sentences: bool = True,
**kwargs,
) -> List[int]:
"""🐸 TTS magic. Run all the models and generate speech.
@ -277,6 +278,8 @@ class Synthesizer(nn.Module):
style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None.
reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
reference_speaker_name ([type], optional): speaker id of reference waveform. Defaults to None.
split_sentences (bool, optional): split the input text into sentences. Defaults to True.
**kwargs: additional arguments to pass to the TTS model.
Returns:
List[int]: the synthesized waveform as a list of audio samples.
"""
@ -289,8 +292,10 @@ class Synthesizer(nn.Module):
)
if text:
sens = self.split_into_sentences(text)
print(" > Text splitted to sentences.")
sens = [text]
if split_sentences:
print(" > Text splitted to sentences.")
sens = self.split_into_sentences(text)
print(sens)
# handle multi-speaker

View File

@ -39,6 +39,10 @@ You can also mail us at info@coqui.ai.
#### 🐸TTS API
##### Single reference
Splits the text into sentences and generates audio for each sentence. The audio files are then concatenated to produce the final audio.
You can optionally disable sentence splitting for better coherence across sentence boundaries, at the cost of more VRAM and possibly hitting the model's context-length limit.
```python
from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
@ -47,14 +51,29 @@ tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
file_path="output.wav",
speaker_wav=["/path/to/target/speaker.wav"],
language="en")
language="en",
split_sentences=True
)
```
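If you hit stitching artifacts at sentence boundaries, you can disable splitting. A sketch with the same placeholder paths as above; expect higher VRAM use and possible context-length errors on long inputs:

```python
tts.tts_to_file(text="A longer passage that would normally be split into several sentences ...",
                file_path="output_single_pass.wav",
                speaker_wav=["/path/to/target/speaker.wav"],
                language="en",
                split_sentences=False)
```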
##### Multiple references
You can pass multiple audio files to the `speaker_wav` argument for better voice cloning.
```python
from TTS.api import TTS
# using the default version set in 🐸TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
# using a specific version
# 👀 see the branch names for versions on https://huggingface.co/coqui/XTTS-v2/tree/main
# ❗some versions might be incompatible with the API
tts = TTS("xtts_v2.0.2", gpu=True)
# getting the latest XTTS_v2
tts = TTS("xtts", gpu=True)
# generate speech by cloning a voice using default settings
tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
file_path="output.wav",
@ -62,6 +81,42 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t
language="en")
```
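As a sketch, several reference clips can be passed as a list (file names here are placeholders):

```python
tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                file_path="output.wav",
                speaker_wav=["/path/to/speaker_1.wav", "/path/to/speaker_2.wav", "/path/to/speaker_3.wav"],
                language="en")
```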
##### Streaming inference
XTTS supports streaming inference. This is useful for real-time applications.
```python
import os
import time
import torch
import torchaudio

from TTS.api import TTS
print("Loading model...")
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
model = tts.synthesizer.tts_model
print("Computing speaker latents...")
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
print("Inference...")
t0 = time.time()
stream_generator = model.inference_stream(
"It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
"en",
gpt_cond_latent,
speaker_embedding
)
wav_chunks = []
for i, chunk in enumerate(stream_generator):
    if i == 0:
        print(f"Time to first chunk: {time.time() - t0}")
    print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
    wav_chunks.append(chunk)
wav = torch.cat(wav_chunks, dim=0)
torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
```
#### 🐸TTS Command line
##### Single reference
@ -91,10 +146,13 @@ or for all wav files in a directory you can use:
--use_cuda true
```
#### 🐸TTS Model API
#### model directly
To use the model API, you need to download the model files and pass config and model file paths manually.
If you want to be able to run with `use_deepspeed=True` and enjoy the speedup, you need to install deepspeed first.
##### Calling manually
If you want to be able to run with `use_deepspeed=True` and **enjoy the speedup**, you need to install deepspeed first.
```console
pip install deepspeed==0.10.3
@ -129,7 +187,7 @@ torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
```
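A minimal sketch of the manual workflow (paths are placeholders; the calls follow the XTTS model API and may differ slightly between versions):

```python
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# load the config and checkpoint from a local XTTS download
config = XttsConfig()
config.load_json("/path/to/xtts/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=True)
model.cuda()

# synthesize while cloning the voice in the reference wav
outputs = model.synthesize(
    "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
    config,
    speaker_wav="/path/to/target/speaker.wav",
    gpt_cond_len=3,
    language="en",
)
```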
#### streaming inference
##### Streaming manually
Here the goal is to stream the audio as it is being generated. This is useful for real-time applications.
Streaming inference is typically slower than regular inference, but it lets you get the first chunk of audio sooner.

View File

@ -11,6 +11,11 @@ class PunctuationTest(unittest.TestCase):
("This, is my text ... to be striped !! from text", "This is my text to be striped from text"),
("This, is my text ... to be striped from text?", "This is my text to be striped from text"),
("This, is my text to be striped from text", "This is my text to be striped from text"),
(".", ""),
(" . ", ""),
("!!! Attention !!!", "Attention"),
("!!! Attention !!! This is just a ... test.", "Attention This is just a test"),
("!!! Attention! This is just a ... test.", "Attention This is just a test"),
]
def test_get_set_puncs(self):