mirror of https://github.com/coqui-ai/TTS.git
Python API implementation (#2195)

* Draft implementation
* Fix style
* Add api tests
* Fix lint
* Update docs
* Update tests
* Set env
* Fixup
* Fixup
* Fix lint
* Revert

pull/2204/head

parent fdeefcc612
commit 1ddc484b49
@@ -31,6 +31,8 @@ jobs:
          cache-dependency-path: 'requirements*'
      - name: check OS
        run: cat /etc/os-release
      - name: set ENV
        run: export TRAINER_TELEMETRY=0
      - name: Install dependencies
        run: |
          sudo apt-get update
@@ -31,10 +31,13 @@ jobs:
          cache-dependency-path: 'requirements*'
      - name: check OS
        run: cat /etc/os-release
      - name: set ENV
        run: export TRAINER_TELEMETRY=0
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends git make gcc
          sudo apt-get install espeak-ng
          make system-deps
      - name: Install/upgrade Python setup deps
        run: python3 -m pip install --upgrade pip setuptools wheel
@@ -0,0 +1,146 @@
from pathlib import Path

from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer


class TTS:
    """TODO: Add voice conversion and Capacitron support."""

    def __init__(self, model_name: str = None, progress_bar: bool = True, gpu=False):
        """🐸TTS python interface that allows you to load and use the released models.

        Example with a multi-speaker model:
            >>> from TTS.api import TTS
            >>> tts = TTS(TTS.list_models()[0])
            >>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
            >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")

        Example with a single-speaker model:
            >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
            >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")

        Args:
            model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
            progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
        """
        self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
        self.synthesizer = None
        if model_name:
            self.load_model_by_name(model_name, gpu)

    @property
    def models(self):
        return self.manager.list_tts_models()

    @property
    def is_multi_speaker(self):
        if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
            return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
        return False

    @property
    def is_multi_lingual(self):
        if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
            return self.synthesizer.tts_model.language_manager.num_languages > 1
        return False

    @property
    def speakers(self):
        if not self.is_multi_speaker:
            return None
        return self.synthesizer.tts_model.speaker_manager.speaker_names

    @property
    def languages(self):
        if not self.is_multi_lingual:
            return None
        return self.synthesizer.tts_model.language_manager.language_names

    @staticmethod
    def get_models_file_path():
        return Path(__file__).parent / ".models.json"

    @staticmethod
    def list_models():
        manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
        return manager.list_tts_models()

    def download_model_by_name(self, model_name: str):
        model_path, config_path, model_item = self.manager.download_model(model_name)
        if model_item["default_vocoder"] is None:
            return model_path, config_path, None, None
        vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
        return model_path, config_path, vocoder_path, vocoder_config_path

    def load_model_by_name(self, model_name: str, gpu: bool = False):
        model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name)
        # init synthesizer
        # None values are fetched from the model
        self.synthesizer = Synthesizer(
            tts_checkpoint=model_path,
            tts_config_path=config_path,
            tts_speakers_file=None,
            tts_languages_file=None,
            vocoder_checkpoint=vocoder_path,
            vocoder_config=vocoder_config_path,
            encoder_checkpoint=None,
            encoder_config=None,
            use_cuda=gpu,
        )

    def _check_arguments(self, speaker: str = None, language: str = None):
        if self.is_multi_speaker and speaker is None:
            raise ValueError("Model is multi-speaker but no speaker is provided.")
        if self.is_multi_lingual and language is None:
            raise ValueError("Model is multi-lingual but no language is provided.")
        if not self.is_multi_speaker and speaker is not None:
            raise ValueError("Model is not multi-speaker but speaker is provided.")
        if not self.is_multi_lingual and language is not None:
            raise ValueError("Model is not multi-lingual but language is provided.")

    def tts(self, text: str, speaker: str = None, language: str = None):
        """Convert text to speech.

        Args:
            text (str):
                Input text to synthesize.
            speaker (str, optional):
                Speaker name for multi-speaker. You can check whether the loaded model is multi-speaker by
                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
            language (str, optional):
                Language code for multi-lingual models. You can check whether the loaded model is multi-lingual by
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
        """
        self._check_arguments(speaker=speaker, language=language)

        wav = self.synthesizer.tts(
            text=text,
            speaker_name=speaker,
            language_name=language,
            speaker_wav=None,
            reference_wav=None,
            style_wav=None,
            style_text=None,
            reference_speaker_name=None,
        )
        return wav

    def tts_to_file(self, text: str, speaker: str = None, language: str = None, file_path: str = "output.wav"):
        """Convert text to speech and save it to a file.

        Args:
            text (str):
                Input text to synthesize.
            speaker (str, optional):
                Speaker name for multi-speaker. You can check whether the loaded model is multi-speaker by
                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
            language (str, optional):
                Language code for multi-lingual models. You can check whether the loaded model is multi-lingual by
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
            file_path (str, optional):
                Output file path. Defaults to "output.wav".
        """
        wav = self.tts(text=text, speaker=speaker, language=language)
        self.synthesizer.save_wav(wav=wav, path=file_path)
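A short usage sketch for the `TTS.api` module above (illustrative only, not part of the commit: the texts and output filename are made up, and it assumes, as the docstring does, that the first entry of `TTS.list_models()` is a multi-speaker, multi-lingual model):

```python
from TTS.api import TTS

# Load the first released model in the list (assumed to be multi-speaker and multi-lingual).
tts = TTS(TTS.list_models()[0])

try:
    # Omitting speaker/language on a multi-speaker, multi-lingual model
    # trips the ValueError raised by _check_arguments() above.
    tts.tts("Hello world!")
except ValueError as err:
    print(f"Expected error: {err}")

# Valid call: returns the synthesized waveform samples.
wav = tts.tts("Hello world!", speaker=tts.speakers[0], language=tts.languages[0])

# Or synthesize straight to a wav file.
tts.tts_to_file(
    text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="hello.wav"
)
```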
@@ -35,11 +35,13 @@ class ModelManager(object):
            models_file (str): path to .model.json file. Defaults to None.
            output_prefix (str): prefix to `tts` to download models. Defaults to None
            progress_bar (bool): print a progress bar when downloading a file. Defaults to False.
            verbose (bool): print info. Defaults to True.
        """

    def __init__(self, models_file=None, output_prefix=None, progress_bar=False):
    def __init__(self, models_file=None, output_prefix=None, progress_bar=False, verbose=True):
        super().__init__()
        self.progress_bar = progress_bar
        self.verbose = verbose
        if output_prefix is None:
            self.output_prefix = get_user_data_dir("tts")
        else:
@@ -62,30 +64,31 @@ class ModelManager(object):
            self.models_dict = json.load(json_file)

    def _list_models(self, model_type, model_count=0):
        if self.verbose:
            print(" Name format: type/language/dataset/model")
        model_list = []
        for lang in self.models_dict[model_type]:
            for dataset in self.models_dict[model_type][lang]:
                for model in self.models_dict[model_type][lang][dataset]:
                    model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
                    output_path = os.path.join(self.output_prefix, model_full_name)
                    if os.path.exists(output_path):
                        print(f" {model_count}: {model_type}/{lang}/{dataset}/{model} [already downloaded]")
                    else:
                        print(f" {model_count}: {model_type}/{lang}/{dataset}/{model}")
                    if self.verbose:
                        if os.path.exists(output_path):
                            print(f" {model_count}: {model_type}/{lang}/{dataset}/{model} [already downloaded]")
                        else:
                            print(f" {model_count}: {model_type}/{lang}/{dataset}/{model}")
                    model_list.append(f"{model_type}/{lang}/{dataset}/{model}")
                    model_count += 1
        return model_list

    def _list_for_model_type(self, model_type):
        print(" Name format: language/dataset/model")
        models_name_list = []
        model_count = 1
        model_type = "tts_models"
        models_name_list.extend(self._list_models(model_type, model_count))
        return [name.replace(model_type + "/", "") for name in models_name_list]
        return models_name_list

    def list_models(self):
        print(" Name format: type/language/dataset/model")
        models_name_list = []
        model_count = 1
        for model_type in self.models_dict:
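The `verbose` flag added above lets callers build a quiet ModelManager, which is how the new `TTS` API constructs its own manager. A minimal sketch (the printed slice is illustrative):

```python
from TTS.api import TTS
from TTS.utils.manage import ModelManager

# List released TTS models programmatically, without the per-model console
# output that the verbose-guarded prints above would emit.
manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
model_names = manager.list_tts_models()
print(model_names[:3])  # a few "type/language/dataset/model" names
```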
@@ -62,7 +62,6 @@ class Synthesizer(object):
        self.tts_model = None
        self.vocoder_model = None
        self.speaker_manager = None
        self.num_speakers = 0
        self.tts_speakers = {}
        self.language_manager = None
        self.num_languages = 0
@@ -11,6 +11,7 @@ After the installation, 2 terminal commands are available.

1. TTS Command Line Interface (CLI). - `tts`
2. Local Demo Server. - `tts-server`
3. In 🐍Python. - `from TTS.api import TTS`

## On the Commandline - `tts`

![cli.gif](https://github.com/coqui-ai/TTS/raw/main/images/tts_cli.gif)
@@ -99,5 +100,30 @@ tts-server --model_name "<type>/<language>/<dataset>/<model_name>" \
           --vocoder_name "<type>/<language>/<dataset>/<model_name>"
```

## TorchHub
You can also use [this simple colab notebook](https://colab.research.google.com/drive/1iAe7ZdxjUIuN6V4ooaCt0fACEGKEn7HW?usp=sharing) using TorchHub to synthesize speech.

## Python API

You can run a multi-speaker and multi-lingual model in Python as follows:

```python
from TTS.api import TTS

# List available 🐸TTS models and choose the first one
model_name = TTS.list_models()[0]
# Init TTS
tts = TTS(model_name)
# Run TTS
# ❗ Since this model is multi-speaker and multi-lingual, we must set the target speaker and the language
# Text to speech with a numpy output
wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
# Text to speech to a file
tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
```

Here is an example for a single-speaker model.

```python
# Init TTS with the target model name
tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
# Run TTS
tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
```
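The documentation examples above run on CPU. The constructor's `gpu` flag is forwarded to the Synthesizer as `use_cuda`, so GPU inference is a one-argument change; a minimal sketch, assuming a CUDA-capable environment:

```python
from TTS.api import TTS

# Same single-speaker German model as above, but synthesizing on GPU.
tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=True)
tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
```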
@@ -0,0 +1,36 @@
import os
import unittest

from tests import get_tests_output_path
from TTS.api import TTS

OUTPUT_PATH = os.path.join(get_tests_output_path(), "test_python_api.wav")


class TTSTest(unittest.TestCase):
    def test_single_speaker_model(self):
        tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)

        error_raised = False
        try:
            tts.tts_to_file(text="Ich bin eine Testnachricht.", speaker="Thorsten", language="de")
        except ValueError:
            error_raised = True

        tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)

        self.assertTrue(error_raised)
        self.assertFalse(tts.is_multi_speaker)
        self.assertFalse(tts.is_multi_lingual)
        self.assertIsNone(tts.speakers)
        self.assertIsNone(tts.languages)

    def test_multi_speaker_multi_lingual_model(self):
        tts = TTS()
        tts.load_model_by_name(tts.models[0])  # YourTTS
        tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path=OUTPUT_PATH)

        self.assertTrue(tts.is_multi_speaker)
        self.assertTrue(tts.is_multi_lingual)
        self.assertGreater(len(tts.speakers), 1)
        self.assertGreater(len(tts.languages), 1)