remove all espeak and phonemizer deps

pull/488/head
Eren Gölge 2021-05-18 15:07:25 +02:00
parent ced05e812a
commit d7fae3f515
9 changed files with 3 additions and 1973 deletions

View File

@@ -1,7 +1,6 @@
#!/bin/bash
yes | apt-get install sox
yes | apt-get install ffmpeg
yes | apt-get install espeak
yes | apt-get install tmux
yes | apt-get install zsh
sh -c "$(curl -fsSL https://raw.githubusercontent.com/robbyrussell/oh-my-zsh/master/tools/install.sh)"

View File

@@ -7,7 +7,6 @@ help:
target_dirs := tests TTS notebooks
system-deps: ## install linux system deps
sudo apt-get install -y espeak-ng
sudo apt-get install -y libsndfile1-dev
dev-deps: ## install development deps

View File

@@ -22,42 +22,3 @@ Run the server with the official models on a GPU.
Run the server with custom models.
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth.tar --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth.tar --vocoder_config /path/to/vocoder/config.json```
<!-- ##### Using .whl
1. apt-get install -y espeak libsndfile1 python3-venv
2. python3 -m venv /tmp/venv
3. source /tmp/venv/bin/activate
4. pip install -U pip setuptools wheel
5. pip install -U https://example.com/url/to/python/package.whl
6. python -m TTS.server.server
You can now open http://localhost:5002 in a browser -->
<!-- #### Running with nginx/uwsgi:
**Note:** This method uses an old TTS model, so quality might be low.
1. apt-get install -y uwsgi uwsgi-plugin-python3 nginx espeak libsndfile1 python3-venv
2. python3 -m venv /tmp/venv
3. source /tmp/venv/bin/activate
4. pip install -U pip setuptools wheel
5. pip install -U https://example.com/url/to/python/package.whl
6. curl -LO https://github.com/reuben/TTS/releases/download/t2-ljspeech-mold/t2-ljspeech-mold-nginx-uwsgi.zip
7. unzip *-nginx-uwsgi.zip
8. cp tts_site_nginx /etc/nginx/sites-enabled/default
9. service nginx restart
10. uwsgi --ini uwsgi.ini
You can now open http://localhost:80 in a browser (edit the port in /etc/nginx/sites-enabled/tts_site_nginx).
Configure number of workers (number of requests that will be processed in parallel) by editing the `uwsgi.ini` file, specifically the `processes` setting. -->
<!-- #### Creating a server package with an embedded model
[setup.py](../setup.py) was extended with two new parameters when running the `bdist_wheel` command:
- `--checkpoint <path to checkpoint file>` - path to model checkpoint file you want to embed in the package
- `--model_config <path to config.json file>` - path to corresponding config.json file for the checkpoint
To create a package, run `python setup.py bdist_wheel --checkpoint /path/to/checkpoint --model_config /path/to/config.json`.
A Python `.whl` file will be created in the `dist/` folder with the checkpoint and config embedded in it. -->
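Once the server is running (with either the custom or the official models), it can be queried programmatically as well as through the browser page on port 5002 mentioned above. A minimal client sketch follows; hedged: the `/api/tts` route and its `text` query parameter are assumptions about `TTS/server/server.py` and are not shown in this diff.

```python
import requests

# Hypothetical client call; the /api/tts endpoint and the "text" parameter are assumptions,
# check TTS/server/server.py for the actual routes and parameters.
response = requests.get(
    "http://localhost:5002/api/tts",
    params={"text": "Hello from the demo server."},
    timeout=60,
)
response.raise_for_status()

# The server is expected to answer with a WAV payload.
with open("out.wav", "wb") as f:
    f.write(response.content)
```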

View File

@@ -2,9 +2,7 @@
import re
import phonemizer
from packaging import version
from phonemizer.phonemize import phonemize
from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
from TTS.tts.utils.text import cleaners
@@ -28,9 +26,7 @@ PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+"
def text2phone(text, language):
"""Convert graphemes to phonemes. For most of the languages, it calls
the phonemizer python library that calls espeak/espeak-ng. For chinese
mandarin, it calls pypinyin + custom function for phonemizing
"""Convert graphemes to phonemes.
Parameters:
text (str): text to phonemize
language (str): language of the text
@@ -43,47 +39,7 @@ def text2phone(text, language):
if language == "zh-CN":
ph = chinese_text_to_phonemes(text)
return ph
seperator = phonemizer.separator.Separator(" |", "", "|")
# try:
punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text)
if version.parse(phonemizer.__version__) < version.parse("2.1"):
ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend="espeak", language=language)
ph = ph[:-1].strip() # skip the last empty character
# phonemizer does not tackle punctuations. Here we do.
# Replace \n with matching punctuations.
if punctuations:
# if text ends with a punctuation.
if text[-1] == punctuations[-1]:
for punct in punctuations[:-1]:
ph = ph.replace("| |\n", "|" + punct + "| |", 1)
ph = ph + punctuations[-1]
else:
for punct in punctuations:
ph = ph.replace("| |\n", "|" + punct + "| |", 1)
elif version.parse(phonemizer.__version__) >= version.parse("2.1"):
ph = phonemize(
text,
separator=seperator,
strip=False,
njobs=1,
backend="espeak",
language=language,
preserve_punctuation=True,
language_switch="remove-flags",
)
# this is a simple fix for phonemizer.
# https://github.com/bootphon/phonemizer/issues/32
if punctuations:
for punctuation in punctuations:
ph = ph.replace(f"| |{punctuation} ", f"|{punctuation}| |").replace(
f"| |{punctuation}", f"|{punctuation}| |"
)
ph = ph[:-3]
else:
raise RuntimeError(" [!] Use 'phonemizer' version 2.1 or older.")
return ph
raise ValueError(f" [!] Language {language} is nor supported for phonemization.")
def intersperse(sequence, token):
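For reference, the `text2phone` helper that remains after this change keeps only the pypinyin-backed Chinese Mandarin path and rejects every other language, since the espeak-backed phonemizer branch above is removed. A minimal sketch reconstructed from this hunk (not the verbatim file; the rest of the module is assumed unchanged):

```python
# Sketch of text2phone in TTS/tts/utils/text/__init__.py after this commit,
# reconstructed from the diff above.
from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes


def text2phone(text, language):
    """Convert graphemes to phonemes.

    Parameters:
        text (str): text to phonemize
        language (str): language of the text
    """
    # Only Chinese Mandarin is phonemized in-repo (pypinyin based); espeak is no longer called.
    if language == "zh-CN":
        ph = chinese_text_to_phonemes(text)
        return ph
    raise ValueError(f" [!] Language {language} is not supported for phonemization.")
```

For example, `text2phone("你好", "zh-CN")` still returns a phoneme string, while `text2phone("hello", "en-us")` now raises `ValueError` instead of shelling out to espeak.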

View File

@@ -1,6 +1,6 @@
dependencies = [
'torch', 'gdown', 'pysbd', 'phonemizer', 'unidecode', 'pypinyin'
] # apt install espeak-ng
]
import torch
from TTS.utils.manage import ModelManager
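The only change in this hunk is dropping the `# apt install espeak-ng` note from the closing bracket of `dependencies`, in line with removing the system espeak requirement; note that `phonemizer` itself still appears in the list here. The module-level `dependencies` list is the torch.hub convention, so this looks like the `hubconf.py` consulted by `torch.hub.load`; a hedged usage sketch follows (the repo path `coqui-ai/TTS` and the `tts` entrypoint name are assumptions, not shown in this diff; check `hubconf.py` for the real entrypoints):

```python
import torch

# Hypothetical torch.hub call; the "coqui-ai/TTS" repo path, the "tts" entrypoint,
# and its default arguments are assumed, not taken from this diff.
synthesizer = torch.hub.load("coqui-ai/TTS", "tts", source="github")
```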

View File

@@ -1,387 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "6LWsNd3_M3MP"
},
"source": [
"# Mozilla TTS on CPU Real-Time Speech Synthesis with TFLite"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "FAqrSIWgLyP0"
},
"source": [
"**These models are converted from released [PyTorch models](https://colab.research.google.com/drive/1u_16ZzHjKYFn1HNVuA4Qf_i2MMFB9olY?usp=sharing) using our TF utilities provided in Mozilla TTS.**\n",
"\n",
"#### **Notebook Details**\n",
"These TFLite models support TF 2.3rc0 and for different versions you might need to regenerate them. \n",
"\n",
"TFLite optimizations degrades the TTS model performance and we do not apply\n",
"any optimization for the vocoder model due to the same reason. If you like to\n",
"keep the quality, consider to regenerate TFLite model accordingly.\n",
"\n",
"Models optimized with TFLite can be slow on a regular CPU since it is optimized\n",
"specifically for lower-end systems.\n",
"\n",
"---\n",
"\n",
"\n",
"\n",
"#### **Model Details** \n",
"We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n",
"\n",
"Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n",
"\n",
"MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n",
"\n",
"Note that both model performances can be improved with more training.\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Ku-dA4DKoeXk"
},
"source": [
"### Download TF Models and configs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 162
},
"colab_type": "code",
"id": "jGIgnWhGsxU1",
"outputId": "57af701e-77ec-400d-fee5-64aa7603d357"
},
"outputs": [],
"source": [
"!gdown --id 17PYXCmTe0el_SLTwznrt3vOArNGMGo5v -O tts_model.tflite\n",
"!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O config.json"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 235
},
"colab_type": "code",
"id": "4dnpE0-kvTsu",
"outputId": "6aab0622-9add-4ee4-b9f8-177d6ddc0e86"
},
"outputs": [],
"source": [
"!gdown --id 1aXveT-NjOM1mUr6tM4JfWjshq67GvVIO -O vocoder_model.tflite\n",
"!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O config_vocoder.json\n",
"!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O scale_stats.npy"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "_ZuDrj_ioqHE"
},
"source": [
"### Setup Libraries"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 964
},
"colab_type": "code",
"id": "X2axt5BYq7gv",
"outputId": "aa53986f-f218-4d17-8667-0d74bb90c927"
},
"outputs": [],
"source": [
"# need it for char to phoneme conversion\n",
"! sudo apt-get install espeak"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 144
},
"colab_type": "code",
"id": "ZduAf-qYYEIT",
"outputId": "c1fcac0d-b8f8-442c-d598-4f549c42b698"
},
"outputs": [],
"source": [
"!git clone https://github.com/mozilla/TTS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"colab_type": "code",
"id": "ofPCvPyjZEcT",
"outputId": "f3d3ea73-eae5-473c-db19-276bd0e721cc"
},
"outputs": [],
"source": [
"%cd TTS\n",
"!git checkout c7296b3\n",
"!pip install -r requirements.txt\n",
"!python setup.py install\n",
"!pip install tensorflow==2.3.0rc0\n",
"%cd .."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Zlgi8fPdpRF0"
},
"source": [
"### Define TTS function"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "f-Yc42nQZG5A"
},
"outputs": [],
"source": [
"def run_vocoder(mel_spec):\n",
" vocoder_inputs = mel_spec[None, :, :]\n",
" # get input and output details\n",
" input_details = vocoder_model.get_input_details()\n",
" # reshape input tensor for the new input shape\n",
" vocoder_model.resize_tensor_input(input_details[0]['index'], vocoder_inputs.shape)\n",
" vocoder_model.allocate_tensors()\n",
" detail = input_details[0]\n",
" vocoder_model.set_tensor(detail['index'], vocoder_inputs)\n",
" # run the model\n",
" vocoder_model.invoke()\n",
" # collect outputs\n",
" output_details = vocoder_model.get_output_details()\n",
" waveform = vocoder_model.get_tensor(output_details[0]['index'])\n",
" return waveform \n",
"\n",
"\n",
"def tts(model, text, CONFIG, p):\n",
" t_1 = time.time()\n",
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
" truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n",
" backend='tflite')\n",
" waveform = run_vocoder(mel_postnet_spec.T)\n",
" waveform = waveform[0, 0]\n",
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
" tps = (time.time() - t_1) / len(waveform)\n",
" print(waveform.shape)\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" print(\" > Real-time factor: {}\".format(rtf))\n",
" print(\" > Time per step: {}\".format(tps))\n",
" IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n",
" return alignment, mel_postnet_spec, stop_tokens, waveform"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ZksegYQepkFg"
},
"source": [
"### Load TF Models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "oVa0kOamprgj"
},
"outputs": [],
"source": [
"import os\n",
"import torch\n",
"import time\n",
"import IPython\n",
"\n",
"from TTS.tf.utils.tflite import load_tflite_model\n",
"from TTS.tf.utils.io import load_checkpoint\n",
"from TTS.utils.io import load_config\n",
"from TTS.utils.text.symbols import symbols, phonemes\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.tts.utils.synthesis import synthesis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "EY-sHVO8IFSH"
},
"outputs": [],
"source": [
"# runtime settings\n",
"use_cuda = False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "_1aIUp2FpxOQ"
},
"outputs": [],
"source": [
"# model paths\n",
"TTS_MODEL = \"tts_model.tflite\"\n",
"TTS_CONFIG = \"config.json\"\n",
"VOCODER_MODEL = \"vocoder_model.tflite\"\n",
"VOCODER_CONFIG = \"config_vocoder.json\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "CpgmdBVQplbv"
},
"outputs": [],
"source": [
"# load configs\n",
"TTS_CONFIG = load_config(TTS_CONFIG)\n",
"VOCODER_CONFIG = load_config(VOCODER_CONFIG)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 471
},
"colab_type": "code",
"id": "zmrQxiozIUVE",
"outputId": "ca7e9016-4c28-4cef-efe7-0613d399aa4c"
},
"outputs": [],
"source": [
"# load the audio processor\n",
"ap = AudioProcessor(**TTS_CONFIG.audio) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "8fLoI4ipqMeS"
},
"outputs": [],
"source": [
"# LOAD TTS MODEL\n",
"# multi speaker \n",
"speaker_id = None\n",
"speakers = []\n",
"\n",
"# load the models\n",
"model = load_tflite_model(TTS_MODEL)\n",
"vocoder_model = load_tflite_model(VOCODER_MODEL)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Ws_YkPKsLgo-"
},
"source": [
"## Run Inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 134
},
"colab_type": "code",
"id": "FuWxZ9Ey5Puj",
"outputId": "d1888ebd-3208-42a4-aaf9-78d0e3ec987d"
},
"outputs": [],
"source": [
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasnt absolutely certain it was, he just let it go.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "DDC-TTS_and_MultiBand-MelGAN_TFLite_Example.ipynb",
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -1,650 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "yZK6UdwSFnOO"
},
"source": [
"# **Download and install Coqui TTS**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "yvb0pX3WY6MN"
},
"outputs": [],
"source": [
"import os \n",
"!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "iB9nl2UEG3SY"
},
"outputs": [],
"source": [
"!apt-get install espeak\n",
"os.chdir('TTS')\n",
"!pip install -r requirements.txt\n",
"!python setup.py develop\n",
"os.chdir('..')"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "w6Krn8k1inC_"
},
"source": [
"\n",
"\n",
"**Download Checkpoint**\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "PiYHf3lKhi9z"
},
"outputs": [],
"source": [
"!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018.zip\n",
"!unzip ./TTS-checkpoint.zip\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "MpYNgqrZcJKn"
},
"source": [
"**Utils Functions**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "4KZA4b_CbMqx"
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import argparse\n",
"import json\n",
"# pylint: disable=redefined-outer-name, unused-argument\n",
"import os\n",
"import string\n",
"import time\n",
"import sys\n",
"import numpy as np\n",
"\n",
"TTS_PATH = \"../content/TTS\"\n",
"# add libraries into environment\n",
"sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n",
"\n",
"import torch\n",
"\n",
"from TTS.tts.utils.generic_utils import setup_model\n",
"from TTS.tts.utils.synthesis import synthesis\n",
"from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.io import load_config\n",
"from TTS.vocoder.utils.generic_utils import setup_generator\n",
"\n",
"\n",
"def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None):\n",
" t_1 = time.time()\n",
" waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, None, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n",
" if CONFIG.model == \"Tacotron\" and not use_gl:\n",
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
" if not use_gl:\n",
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
" if use_cuda and not use_gl:\n",
" waveform = waveform.cpu()\n",
" if not use_gl:\n",
" waveform = waveform.numpy()\n",
" waveform = waveform.squeeze()\n",
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
" tps = (time.time() - t_1) / len(waveform)\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" print(\" > Real-time factor: {}\".format(rtf))\n",
" print(\" > Time per step: {}\".format(tps))\n",
" return waveform\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ENA2OumIVeMA"
},
"source": [
"# **Vars definitions**\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "jPD0d_XpVXmY"
},
"outputs": [],
"source": [
"TEXT = ''\n",
"OUT_PATH = 'tests-audios/'\n",
"# create output path\n",
"os.makedirs(OUT_PATH, exist_ok=True)\n",
"\n",
"SPEAKER_FILEID = None # if None use the first embedding from speakers.json\n",
"\n",
"# model vars \n",
"MODEL_PATH = 'best_model.pth.tar'\n",
"CONFIG_PATH = 'config.json'\n",
"SPEAKER_JSON = 'speakers.json'\n",
"\n",
"# vocoder vars\n",
"VOCODER_PATH = ''\n",
"VOCODER_CONFIG_PATH = ''\n",
"\n",
"USE_CUDA = True"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "dV6cXXlfi72r"
},
"source": [
"# **Restore TTS Model**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "x1WgLFauWUPe"
},
"outputs": [],
"source": [
"# load the config\n",
"C = load_config(CONFIG_PATH)\n",
"C.forward_attn_mask = True\n",
"\n",
"# load the audio processor\n",
"ap = AudioProcessor(**C.audio)\n",
"\n",
"# if the vocabulary was passed, replace the default\n",
"if 'characters' in C.keys():\n",
" symbols, phonemes = make_symbols(**C.characters)\n",
"\n",
"speaker_embedding = None\n",
"speaker_embedding_dim = None\n",
"num_speakers = 0\n",
"# load speakers\n",
"if SPEAKER_JSON != '':\n",
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
" num_speakers = len(speaker_mapping)\n",
" if C.use_external_speaker_embedding_file:\n",
" if SPEAKER_FILEID is not None:\n",
" speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']\n",
" else: # if speaker_fileid is not specificated use the first sample in speakers.json\n",
" choise_speaker = list(speaker_mapping.keys())[0]\n",
" print(\" Speaker: \",choise_speaker.split('_')[0],'was chosen automatically', \"(this speaker seen in training)\")\n",
" speaker_embedding = speaker_mapping[choise_speaker]['embedding']\n",
" speaker_embedding_dim = len(speaker_embedding)\n",
"\n",
"# load the model\n",
"num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
"model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n",
"cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n",
"model.load_state_dict(cp['model'])\n",
"model.eval()\n",
"\n",
"if USE_CUDA:\n",
" model.cuda()\n",
"\n",
"model.decoder.set_r(cp['r'])\n",
"\n",
"# load vocoder model\n",
"if VOCODER_PATH!= \"\":\n",
" VC = load_config(VOCODER_CONFIG_PATH)\n",
" vocoder_model = setup_generator(VC)\n",
" vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n",
" vocoder_model.remove_weight_norm()\n",
" if USE_CUDA:\n",
" vocoder_model.cuda()\n",
" vocoder_model.eval()\n",
"else:\n",
" vocoder_model = None\n",
" VC = None\n",
"\n",
"# synthesize voice\n",
"use_griffin_lim = VOCODER_PATH== \"\"\n",
"\n",
"if not C.use_external_speaker_embedding_file:\n",
" if SPEAKER_FILEID.isdigit():\n",
" SPEAKER_FILEID = int(SPEAKER_FILEID)\n",
" else:\n",
" SPEAKER_FILEID = None\n",
"else:\n",
" SPEAKER_FILEID = None\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "tNvVEoE30qY6"
},
"source": [
"Synthesize sentence with Speaker\n",
"\n",
"> Stop running the cell to leave!\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "2o8fXkVSyXOa"
},
"outputs": [],
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "vnV-FigfvsS2"
},
"source": [
"# **Select Speaker**\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "RuCGOnJ_fgDV"
},
"outputs": [],
"source": [
"\n",
"# VCTK speakers not seen in training (new speakers)\n",
"VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n",
"\n",
"# VCTK speakers seen in training\n",
"VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n",
"\n",
"\n",
"num_samples_speaker = 2 # In theory the more samples of the speaker the more similar to the real voice it will be!\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "hkvv7gRcx4WV"
},
"source": [
"## **Example select a VCTK seen speaker in training**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "BviNMI9UyCYz"
},
"outputs": [],
"source": [
"# get embedding\n",
"Speaker_choise = VCTK_train_Speakers[0] # choise one of training speakers\n",
"# load speakers\n",
"if SPEAKER_JSON != '':\n",
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
" if C.use_external_speaker_embedding_file:\n",
" speaker_embeddings = []\n",
" for key in list(speaker_mapping.keys()):\n",
" if Speaker_choise in key:\n",
" if len(speaker_embeddings) < num_samples_speaker:\n",
" speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
" # takes the average of the embedings samples of the announcers\n",
" speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "5e5_XnLsx3jg"
},
"outputs": [],
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker seen in training)\")\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "QJ6VgT2a4vHW"
},
"source": [
"## **Example select a VCTK not seen speaker in training (new Speakers)**\n",
"\n",
"\n",
"> Fitting new Speakers :)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "SZS57ZK-4vHa"
},
"outputs": [],
"source": [
"# get embedding\n",
"Speaker_choise = VCTK_test_Speakers[0] # choise one of training speakers\n",
"# load speakers\n",
"if SPEAKER_JSON != '':\n",
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
" if C.use_external_speaker_embedding_file:\n",
" speaker_embeddings = []\n",
" for key in list(speaker_mapping.keys()):\n",
" if Speaker_choise in key:\n",
" if len(speaker_embeddings) < num_samples_speaker:\n",
" speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
" # takes the average of the embedings samples of the announcers\n",
" speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "bbs85vzz4vHo"
},
"outputs": [],
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "LEE6mQLh5Who"
},
"source": [
"# **Example Synthesizing with your own voice :)**\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "La70gSB65nrs"
},
"source": [
" Download and load GE2E Speaker Encoder "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "r0IEFZ0B5vQg"
},
"outputs": [],
"source": [
"!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n",
"!unzip ./SpeakerEncoder-checkpoint.zip"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "jEH8HCTh5mF6"
},
"outputs": [],
"source": [
"SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n",
"SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n",
"SE_CONFIG_PATH =os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n",
"USE_CUDA = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "tOwkfQqT6-Qo"
},
"outputs": [],
"source": [
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.speaker_encoder.model import SpeakerEncoder\n",
"se_config = load_config(SE_CONFIG_PATH)\n",
"se_ap = AudioProcessor(**se_config['audio'])\n",
"\n",
"se_model = SpeakerEncoder(**se_config.model)\n",
"se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n",
"se_model.eval()\n",
"if USE_CUDA:\n",
" se_model.cuda()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "0TLlbUFG8O36"
},
"source": [
"Upload a wav audio file in your voice.\n",
"\n",
"\n",
"> We recommend files longer than 3 seconds, the bigger the file the closer to your voice :)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "_FWwHPjJ8NXl"
},
"outputs": [],
"source": [
"from google.colab import files\n",
"file_list = files.upload()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "WWOf6sgbBbGY"
},
"outputs": [],
"source": [
"# extract embedding from wav files\n",
"speaker_embeddings = []\n",
"for name in file_list.keys():\n",
" if '.wav' in name:\n",
" mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n",
" mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
" if USE_CUDA:\n",
" mel_spec = mel_spec.cuda()\n",
" embedd = se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
" speaker_embeddings.append(embedd)\n",
" else:\n",
" print(\" You need upload Wav files, others files is not supported !!\")\n",
"\n",
"# takes the average of the embedings samples of the announcers\n",
"speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "xmItcGac5WiG"
},
"outputs": [],
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [
"vnV-FigfvsS2",
"hkvv7gRcx4WV",
"QJ6VgT2a4vHW"
],
"name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -1,847 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "yZK6UdwSFnOO"
},
"source": [
"# **Download and install Coqui TTS**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "yvb0pX3WY6MN"
},
"outputs": [],
"source": [
"import os \n",
"!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "iB9nl2UEG3SY"
},
"outputs": [],
"source": [
"!apt-get install espeak\n",
"os.chdir('TTS')\n",
"!pip install -r requirements.txt\n",
"!python setup.py develop\n",
"os.chdir('..')"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "w6Krn8k1inC_"
},
"source": [
"\n",
"\n",
"**Download Checkpoint**\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "PiYHf3lKhi9z"
},
"outputs": [],
"source": [
"!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018-with-GST.zip\n",
"!unzip ./TTS-checkpoint.zip\n",
"\n",
"# Download gst style example\n",
"!wget https://github.com/Edresson/TTS/releases/download/v1.0.0/gst-style-example.wav"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "MpYNgqrZcJKn"
},
"source": [
"**Utils Functions**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "4KZA4b_CbMqx"
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import argparse\n",
"import json\n",
"# pylint: disable=redefined-outer-name, unused-argument\n",
"import os\n",
"import string\n",
"import time\n",
"import sys\n",
"import numpy as np\n",
"\n",
"TTS_PATH = \"../content/TTS\"\n",
"# add libraries into environment\n",
"sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n",
"\n",
"import torch\n",
"\n",
"from TTS.tts.utils.generic_utils import setup_model\n",
"from TTS.tts.utils.synthesis import synthesis\n",
"from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.io import load_config\n",
"from TTS.vocoder.utils.generic_utils import setup_generator\n",
"\n",
"\n",
"def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None):\n",
" t_1 = time.time()\n",
" waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n",
" if CONFIG.model == \"Tacotron\" and not use_gl:\n",
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
" if not use_gl:\n",
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
" if use_cuda and not use_gl:\n",
" waveform = waveform.cpu()\n",
" if not use_gl:\n",
" waveform = waveform.numpy()\n",
" waveform = waveform.squeeze()\n",
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
" tps = (time.time() - t_1) / len(waveform)\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" print(\" > Real-time factor: {}\".format(rtf))\n",
" print(\" > Time per step: {}\".format(tps))\n",
" return waveform\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ENA2OumIVeMA"
},
"source": [
"# **Vars definitions**\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "jPD0d_XpVXmY"
},
"outputs": [],
"source": [
"TEXT = ''\n",
"OUT_PATH = 'tests-audios/'\n",
"# create output path\n",
"os.makedirs(OUT_PATH, exist_ok=True)\n",
"\n",
"SPEAKER_FILEID = None # if None use the first embedding from speakers.json\n",
"\n",
"# model vars \n",
"MODEL_PATH = 'best_model.pth.tar'\n",
"CONFIG_PATH = 'config.json'\n",
"SPEAKER_JSON = 'speakers.json'\n",
"\n",
"# vocoder vars\n",
"VOCODER_PATH = ''\n",
"VOCODER_CONFIG_PATH = ''\n",
"\n",
"USE_CUDA = True"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "dV6cXXlfi72r"
},
"source": [
"# **Restore TTS Model**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "x1WgLFauWUPe"
},
"outputs": [],
"source": [
"# load the config\n",
"C = load_config(CONFIG_PATH)\n",
"C.forward_attn_mask = True\n",
"\n",
"# load the audio processor\n",
"ap = AudioProcessor(**C.audio)\n",
"\n",
"# if the vocabulary was passed, replace the default\n",
"if 'characters' in C.keys():\n",
" symbols, phonemes = make_symbols(**C.characters)\n",
"\n",
"speaker_embedding = None\n",
"speaker_embedding_dim = None\n",
"num_speakers = 0\n",
"# load speakers\n",
"if SPEAKER_JSON != '':\n",
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
" num_speakers = len(speaker_mapping)\n",
" if C.use_external_speaker_embedding_file:\n",
" if SPEAKER_FILEID is not None:\n",
" speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']\n",
" else: # if speaker_fileid is not specificated use the first sample in speakers.json\n",
" choise_speaker = list(speaker_mapping.keys())[0]\n",
" print(\" Speaker: \",choise_speaker.split('_')[0],'was chosen automatically', \"(this speaker seen in training)\")\n",
" speaker_embedding = speaker_mapping[choise_speaker]['embedding']\n",
" speaker_embedding_dim = len(speaker_embedding)\n",
"\n",
"# load the model\n",
"num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
"model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n",
"cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n",
"model.load_state_dict(cp['model'])\n",
"model.eval()\n",
"\n",
"if USE_CUDA:\n",
" model.cuda()\n",
"\n",
"model.decoder.set_r(cp['r'])\n",
"\n",
"# load vocoder model\n",
"if VOCODER_PATH!= \"\":\n",
" VC = load_config(VOCODER_CONFIG_PATH)\n",
" vocoder_model = setup_generator(VC)\n",
" vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n",
" vocoder_model.remove_weight_norm()\n",
" if USE_CUDA:\n",
" vocoder_model.cuda()\n",
" vocoder_model.eval()\n",
"else:\n",
" vocoder_model = None\n",
" VC = None\n",
"\n",
"# synthesize voice\n",
"use_griffin_lim = VOCODER_PATH== \"\"\n",
"\n",
"if not C.use_external_speaker_embedding_file:\n",
" if SPEAKER_FILEID.isdigit():\n",
" SPEAKER_FILEID = int(SPEAKER_FILEID)\n",
" else:\n",
" SPEAKER_FILEID = None\n",
"else:\n",
" SPEAKER_FILEID = None\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "tNvVEoE30qY6"
},
"source": [
"Synthesize sentence with Speaker\n",
"\n",
"> Stop running the cell to leave!\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "2o8fXkVSyXOa"
},
"outputs": [],
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n",
"gst_style = 'gst-style-example.wav'\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "vnV-FigfvsS2"
},
"source": [
"# **Select Speaker**\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "RuCGOnJ_fgDV"
},
"outputs": [],
"source": [
"\n",
"# VCTK speakers not seen in training (new speakers)\n",
"VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n",
"\n",
"# VCTK speakers seen in training\n",
"VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n",
"\n",
"\n",
"num_samples_speaker = 2 # In theory the more samples of the speaker the more similar to the real voice it will be!\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "hkvv7gRcx4WV"
},
"source": [
"## **Example select a VCTK seen speaker in training**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "BviNMI9UyCYz"
},
"outputs": [],
"source": [
"# get embedding\n",
"Speaker_choise = VCTK_train_Speakers[0] # choise one of training speakers\n",
"# load speakers\n",
"if SPEAKER_JSON != '':\n",
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
" if C.use_external_speaker_embedding_file:\n",
" speaker_embeddings = []\n",
" for key in list(speaker_mapping.keys()):\n",
" if Speaker_choise in key:\n",
" if len(speaker_embeddings) < num_samples_speaker:\n",
" speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
" # takes the average of the embedings samples of the announcers\n",
" speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "5e5_XnLsx3jg"
},
"outputs": [],
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker seen in training)\")\n",
"gst_style = 'gst-style-example.wav'\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "QJ6VgT2a4vHW"
},
"source": [
"## **Example select a VCTK not seen speaker in training (new Speakers)**\n",
"\n",
"\n",
"> Fitting new Speakers :)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "SZS57ZK-4vHa"
},
"outputs": [],
"source": [
"# get embedding\n",
"Speaker_choise = VCTK_test_Speakers[0] # choise one of training speakers\n",
"# load speakers\n",
"if SPEAKER_JSON != '':\n",
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
" if C.use_external_speaker_embedding_file:\n",
" speaker_embeddings = []\n",
" for key in list(speaker_mapping.keys()):\n",
" if Speaker_choise in key:\n",
" if len(speaker_embeddings) < num_samples_speaker:\n",
" speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
" # takes the average of the embedings samples of the announcers\n",
" speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "bbs85vzz4vHo"
},
"outputs": [],
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"gst_style = 'gst-style-example.wav'\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "g_G_HweN04W-"
},
"source": [
"# **Changing GST tokens manually (without wav reference)**"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "jyFP5syW2bjt"
},
"source": [
"You can define tokens manually, this way you can increase/decrease the function of a given GST token. For example a token is responsible for the length of the speaker's pauses, if you increase the value of that token you will have longer pauses and if you decrease it you will have shorter pauses."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "SpwjDjCM2a3Y"
},
"outputs": [],
"source": [
"# set gst tokens, in this model we have 5 tokens\n",
"gst_style = {\"0\": 0, \"1\": 0, \"3\": 0, \"4\": 0}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "qWChMbI_0z5X"
},
"outputs": [],
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "uFjUi9xQ3mG3"
},
"outputs": [],
"source": [
"gst_style = {\"0\": 0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Uw0d6gWg4L27"
},
"outputs": [],
"source": [
"gst_style = {\"0\": -0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "V9izw4-54-Tl"
},
"outputs": [],
"source": [
"gst_style = {\"0\": 0, \"1\": 0.9, \"3\": 0, \"4\": 0}\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "LEE6mQLh5Who"
},
"source": [
"# **Example Synthesizing with your own voice :)**\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "La70gSB65nrs"
},
"source": [
" Download and load GE2E Speaker Encoder "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "r0IEFZ0B5vQg"
},
"outputs": [],
"source": [
"!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n",
"!unzip ./SpeakerEncoder-checkpoint.zip"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "jEH8HCTh5mF6"
},
"outputs": [],
"source": [
"SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n",
"SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n",
"SE_CONFIG_PATH =os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n",
"USE_CUDA = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "tOwkfQqT6-Qo"
},
"outputs": [],
"source": [
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.speaker_encoder.model import SpeakerEncoder\n",
"se_config = load_config(SE_CONFIG_PATH)\n",
"se_ap = AudioProcessor(**se_config['audio'])\n",
"\n",
"se_model = SpeakerEncoder(**se_config.model)\n",
"se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n",
"se_model.eval()\n",
"if USE_CUDA:\n",
" se_model.cuda()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "0TLlbUFG8O36"
},
"source": [
"Upload one or more wav audio files in your voice.\n",
"\n",
"\n",
"> We recommend files longer than 3 seconds, the bigger the file the closer to your voice :)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "_FWwHPjJ8NXl"
},
"outputs": [],
"source": [
"# select one or more wav files\n",
"from google.colab import files\n",
"file_list = files.upload()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "WWOf6sgbBbGY"
},
"outputs": [],
"source": [
"# extract embedding from wav files\n",
"speaker_embeddings = []\n",
"for name in file_list.keys():\n",
" if '.wav' in name:\n",
" mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n",
" mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
" if USE_CUDA:\n",
" mel_spec = mel_spec.cuda()\n",
" embedd = se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
" speaker_embeddings.append(embedd)\n",
" else:\n",
" print(\"You need upload Wav files, others files is not supported !!\")\n",
"\n",
"# takes the average of the embedings samples of the announcers\n",
"speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "AQ7eP31d9yzq"
},
"outputs": [],
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n",
"gst_style = {\"0\": 0, \"1\": 0.0, \"3\": 0, \"4\": 0}\n",
"gst_style = 'gst-style-example.wav'\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "11i10yE1-LMJ"
},
"source": [
"Uploading your own GST reference wav file"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "eKohSQG1-KkT"
},
"outputs": [],
"source": [
"# select one wav file for GST reference\n",
"from google.colab import files\n",
"file_list = files.upload()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "xmItcGac5WiG"
},
"outputs": [],
"source": [
"print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n",
"gst_style = list(file_list.keys())[0]\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [
"yZK6UdwSFnOO",
"ENA2OumIVeMA",
"dV6cXXlfi72r",
"vnV-FigfvsS2",
"g_G_HweN04W-",
"LEE6mQLh5Who"
],
"name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018-With-GST.ipynb",
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -7,7 +7,6 @@ librosa==0.8.0
matplotlib
numpy==1.18.5
pandas
phonemizer>=2.2.0
pypinyin
pysbd
pyyaml