mirror of https://github.com/coqui-ai/TTS.git

commit 121eb89d23 (parent 87ff9779d6)
[ci skip] Clear outputs in notebooks
@@ -1,329 +1,342 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {"id": "6LWsNd3_M3MP", "colab_type": "text"},
      "source": [
        "# Mozilla TTS on CPU Real-Time Speech Synthesis"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {"id": "FAqrSIWgLyP0", "colab_type": "text"},
      "source": [
        "We use the Tacotron2 and MultiBand-Melgan models and the LJSpeech dataset.\n",
        "\n",
        "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) for only 130K steps (3 days) with a single GPU.\n",
        "\n",
        "MultiBand-Melgan is trained for 1.45M steps with real spectrograms.\n",
        "\n",
        "Note that both models can be improved with more training."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {"id": "Ku-dA4DKoeXk", "colab_type": "text"},
      "source": [
        "### Download Models"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {"id": "jGIgnWhGsxU1", "colab_type": "code", "colab": {"base_uri": "https://localhost:8080/", "height": 162}, "outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe", "tags": []},
      "source": [
        "!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n",
        "!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {"id": "4dnpE0-kvTsu", "colab_type": "code", "colab": {"base_uri": "https://localhost:8080/", "height": 235}, "outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e", "tags": []},
      "source": [
        "!gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar\n",
        "!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json\n",
        "!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {"id": "Zlgi8fPdpRF0", "colab_type": "text"},
      "source": [
        "### Define TTS function"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {"id": "f-Yc42nQZG5A", "colab_type": "code", "colab": {}},
      "source": [
        "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
        "    t_1 = time.time()\n",
        "    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(\n",
        "        model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
        "        truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
        "    # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
        "    if not use_gl:\n",
        "        waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
        "        waveform = waveform.flatten()\n",
        "    if use_cuda:\n",
        "        waveform = waveform.cpu()\n",
        "    waveform = waveform.numpy()\n",
        "    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
        "    tps = (time.time() - t_1) / len(waveform)\n",
        "    print(waveform.shape)\n",
        "    print(\" > Run-time: {}\".format(time.time() - t_1))\n",
        "    print(\" > Real-time factor: {}\".format(rtf))\n",
        "    print(\" > Time per step: {}\".format(tps))\n",
        "    IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate']))\n",
        "    return alignment, mel_postnet_spec, stop_tokens, waveform"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {"id": "ZksegYQepkFg", "colab_type": "text"},
      "source": [
        "### Load Models"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {"id": "oVa0kOamprgj", "colab_type": "code", "colab": {}},
      "source": [
        "import os\n",
        "import torch\n",
        "import time\n",
        "import IPython\n",
        "\n",
        "from TTS.tts.utils.generic_utils import setup_model\n",
        "from TTS.utils.io import load_config\n",
        "from TTS.tts.utils.text.symbols import symbols, phonemes\n",
        "from TTS.utils.audio import AudioProcessor\n",
        "from TTS.tts.utils.synthesis import synthesis"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {"id": "EY-sHVO8IFSH", "colab_type": "code", "colab": {}},
      "source": [
        "# runtime settings\n",
        "use_cuda = False"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {"id": "_1aIUp2FpxOQ", "colab_type": "code", "colab": {}},
      "source": [
        "# model paths\n",
        "TTS_MODEL = \"data/tts_model.pth.tar\"\n",
        "TTS_CONFIG = \"data/config.json\"\n",
        "VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n",
        "VOCODER_CONFIG = \"data/config_vocoder.json\""
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {"id": "CpgmdBVQplbv", "colab_type": "code", "colab": {}},
      "source": [
        "# load configs\n",
        "TTS_CONFIG = load_config(TTS_CONFIG)\n",
        "VOCODER_CONFIG = load_config(VOCODER_CONFIG)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {"id": "zmrQxiozIUVE", "colab_type": "code", "colab": {"base_uri": "https://localhost:8080/", "height": 471}, "outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49", "tags": []},
      "source": [
        "# load the audio processor\n",
        "TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n",
        "ap = AudioProcessor(**TTS_CONFIG.audio)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {"id": "8fLoI4ipqMeS", "colab_type": "code", "colab": {"base_uri": "https://localhost:8080/", "height": 35}, "outputId": "b789066e-e305-42ad-b3ca-eba8d9267382", "tags": []},
      "source": [
        "# LOAD TTS MODEL\n",
        "# multi speaker\n",
        "speaker_id = None\n",
        "speakers = []\n",
        "\n",
        "# build the model\n",
        "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n",
        "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n",
        "\n",
        "# load model state\n",
        "cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n",
        "\n",
        "# load the model weights\n",
        "model.load_state_dict(cp['model'])\n",
        "if use_cuda:\n",
        "    model.cuda()\n",
        "model.eval()\n",
        "\n",
        "# set model stepsize\n",
        "if 'r' in cp:\n",
        "    model.decoder.set_r(cp['r'])"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {"id": "zKoq0GgzqzhQ", "colab_type": "code", "colab": {"base_uri": "https://localhost:8080/", "height": 1000}, "outputId": "234efc61-f37a-40bc-95a3-b51896018ccb", "tags": []},
      "source": [
        "from TTS.vocoder.utils.generic_utils import setup_generator\n",
        "\n",
        "# LOAD VOCODER MODEL\n",
        "vocoder_model = setup_generator(VOCODER_CONFIG)\n",
        "vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n",
        "vocoder_model.remove_weight_norm()\n",
        "vocoder_model.inference_padding = 0\n",
        "\n",
        "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])\n",
        "if use_cuda:\n",
        "    vocoder_model.cuda()\n",
        "vocoder_model.eval()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {"id": "Ws_YkPKsLgo-", "colab_type": "text"},
      "source": [
        "## Run Inference"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {"id": "FuWxZ9Ey5Puj", "colab_type": "code", "colab": {"base_uri": "https://localhost:8080/", "height": 134}, "outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91", "tags": []},
      "source": [
        "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
        "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)"
      ],
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "collapsed_sections": [],
      "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb",
      "provenance": [],
      "toc_visible": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {"name": "ipython", "version": 3},
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.5"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}
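Condensed to a plain script, the notebook above is a two-stage pipeline: the Tacotron2 (DDC) model turns text into a mel spectrogram, and the MultiBand-MelGAN generator turns the spectrogram into a waveform. The sketch below strings the same cells together in one place; it is a minimal sketch assuming the TTS package layout and the data/ files fetched by the gdown cells, and it calls nothing beyond what the notebook itself calls.

# A minimal, linear sketch of the notebook's pipeline (CPU inference).
# Everything here mirrors the cells above; only the example sentence is new.
import time

import torch

from TTS.tts.utils.generic_utils import setup_model
from TTS.utils.io import load_config
from TTS.tts.utils.text.symbols import symbols, phonemes
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.synthesis import synthesis
from TTS.vocoder.utils.generic_utils import setup_generator

use_cuda = False

# configs and the audio processor (with the normalization stats)
TTS_CONFIG = load_config("data/config.json")
VOCODER_CONFIG = load_config("data/config_vocoder.json")
TTS_CONFIG.audio["stats_path"] = "data/scale_stats.npy"
ap = AudioProcessor(**TTS_CONFIG.audio)

# Tacotron2 (DDC): text -> mel spectrogram
num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, 0, TTS_CONFIG)  # 0 speakers, as in the notebook
cp = torch.load("data/tts_model.pth.tar", map_location=torch.device("cpu"))
model.load_state_dict(cp["model"])
model.eval()
if "r" in cp:
    model.decoder.set_r(cp["r"])  # decoder reduction factor saved with the checkpoint

# MultiBand-MelGAN: mel spectrogram -> waveform
vocoder_model = setup_generator(VOCODER_CONFIG)
vocoder_model.load_state_dict(torch.load("data/vocoder_model.pth.tar", map_location="cpu")["model"])
vocoder_model.remove_weight_norm()
vocoder_model.inference_padding = 0
vocoder_model.eval()

# run both stages on one sentence and time it
sentence = "Speech synthesis in two stages."
t_0 = time.time()
_, alignment, _, mel_postnet_spec, _, _ = synthesis(
    model, sentence, TTS_CONFIG, use_cuda, ap, None, style_wav=None,
    truncated=False, enable_eos_bos_chars=TTS_CONFIG.enable_eos_bos_chars)
waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
waveform = waveform.flatten().numpy()
print("RTF:", (time.time() - t_0) / (len(waveform) / ap.sample_rate))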
@@ -338,7 +338,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.7"
+   "version": "3.8.5"
   }
  },
  "nbformat": 4,
@@ -1,329 +1,342 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {"id": "6LWsNd3_M3MP", "colab_type": "text"},
      "source": [
        "# Mozilla TTS on CPU Real-Time Speech Synthesis"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {"id": "FAqrSIWgLyP0", "colab_type": "text"},
      "source": [
        "We use the Tacotron2 and MultiBand-Melgan models and the LJSpeech dataset.\n",
        "\n",
        "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) for only 130K steps (3 days) with a single GPU.\n",
        "\n",
        "MultiBand-Melgan is trained for 1.45M steps with real spectrograms.\n",
        "\n",
        "Note that both models can be improved with more training."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {"id": "Ku-dA4DKoeXk", "colab_type": "text"},
      "source": [
        "### Download Models"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {"id": "jGIgnWhGsxU1", "colab_type": "code", "colab": {"base_uri": "https://localhost:8080/", "height": 162}, "outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe", "tags": []},
      "source": [
        "!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n",
        "!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {"id": "4dnpE0-kvTsu", "colab_type": "code", "colab": {"base_uri": "https://localhost:8080/", "height": 235}, "outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e", "tags": []},
      "source": [
        "!gdown --id 1X09hHAyAJOnrplCUMAdW_t341Kor4YR4 -O data/vocoder_model.pth.tar\n",
        "!gdown --id \"1qN7vQRIYkzvOX_DtiZtTajzoZ1eW1-Eg\" -O data/config_vocoder.json\n",
        "!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {"id": "Zlgi8fPdpRF0", "colab_type": "text"},
      "source": [
        "### Define TTS function"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {"id": "f-Yc42nQZG5A", "colab_type": "code", "colab": {}},
      "source": [
        "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
        "    t_1 = time.time()\n",
        "    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(\n",
        "        model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
        "        truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
        "    # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
        "    if not use_gl:\n",
        "        waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
        "        waveform = waveform.flatten()\n",
        "    if use_cuda:\n",
        "        waveform = waveform.cpu()\n",
        "    waveform = waveform.numpy()\n",
        "    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
        "    tps = (time.time() - t_1) / len(waveform)\n",
        "    print(waveform.shape)\n",
        "    print(\" > Run-time: {}\".format(time.time() - t_1))\n",
        "    print(\" > Real-time factor: {}\".format(rtf))\n",
        "    print(\" > Time per step: {}\".format(tps))\n",
        "    IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate']))\n",
        "    return alignment, mel_postnet_spec, stop_tokens, waveform"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {"id": "ZksegYQepkFg", "colab_type": "text"},
      "source": [
        "### Load Models"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {"id": "oVa0kOamprgj", "colab_type": "code", "colab": {}},
      "source": [
        "import os\n",
        "import torch\n",
        "import time\n",
        "import IPython\n",
        "\n",
        "from TTS.tts.utils.generic_utils import setup_model\n",
        "from TTS.utils.io import load_config\n",
        "from TTS.tts.utils.text.symbols import symbols, phonemes\n",
        "from TTS.utils.audio import AudioProcessor\n",
        "from TTS.tts.utils.synthesis import synthesis"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {"id": "EY-sHVO8IFSH", "colab_type": "code", "colab": {}},
      "source": [
        "# runtime settings\n",
        "use_cuda = False"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {"id": "_1aIUp2FpxOQ", "colab_type": "code", "colab": {}},
      "source": [
        "# model paths\n",
        "TTS_MODEL = \"data/tts_model.pth.tar\"\n",
        "TTS_CONFIG = \"data/config.json\"\n",
        "VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n",
        "VOCODER_CONFIG = \"data/config_vocoder.json\""
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {"id": "CpgmdBVQplbv", "colab_type": "code", "colab": {}},
      "source": [
        "# load configs\n",
        "TTS_CONFIG = load_config(TTS_CONFIG)\n",
        "VOCODER_CONFIG = load_config(VOCODER_CONFIG)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {"id": "zmrQxiozIUVE", "colab_type": "code", "colab": {"base_uri": "https://localhost:8080/", "height": 471}, "outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49", "tags": []},
      "source": [
        "# load the audio processor\n",
        "TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n",
        "ap = AudioProcessor(**TTS_CONFIG.audio)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {"id": "8fLoI4ipqMeS", "colab_type": "code", "colab": {"base_uri": "https://localhost:8080/", "height": 35}, "outputId": "b789066e-e305-42ad-b3ca-eba8d9267382", "tags": []},
      "source": [
        "# LOAD TTS MODEL\n",
        "# multi speaker\n",
        "speaker_id = None\n",
        "speakers = []\n",
        "\n",
        "# build the model\n",
        "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n",
        "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n",
        "\n",
        "# load model state\n",
        "cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n",
        "\n",
        "# load the model weights\n",
        "model.load_state_dict(cp['model'])\n",
        "if use_cuda:\n",
        "    model.cuda()\n",
        "model.eval()\n",
        "\n",
        "# set model stepsize\n",
        "if 'r' in cp:\n",
        "    model.decoder.set_r(cp['r'])"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {"id": "zKoq0GgzqzhQ", "colab_type": "code", "colab": {"base_uri": "https://localhost:8080/", "height": 1000}, "outputId": "234efc61-f37a-40bc-95a3-b51896018ccb", "tags": []},
      "source": [
        "from TTS.vocoder.utils.generic_utils import setup_generator\n",
        "\n",
        "# LOAD VOCODER MODEL\n",
        "vocoder_model = setup_generator(VOCODER_CONFIG)\n",
        "vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n",
        "vocoder_model.remove_weight_norm()\n",
        "vocoder_model.inference_padding = 0\n",
        "\n",
        "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])\n",
        "if use_cuda:\n",
        "    vocoder_model.cuda()\n",
        "vocoder_model.eval()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {"id": "Ws_YkPKsLgo-", "colab_type": "text"},
      "source": [
        "## Run Inference"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {"id": "FuWxZ9Ey5Puj", "colab_type": "code", "colab": {"base_uri": "https://localhost:8080/", "height": 134}, "outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91", "tags": []},
      "source": [
        "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
        "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)"
      ],
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "collapsed_sections": [],
      "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb",
      "provenance": [],
      "toc_visible": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {"name": "ipython", "version": 3},
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.5"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}
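Both notebooks report a real-time factor (RTF) and a time-per-step (TPS) after synthesis. The formulas come straight from the tts() cell: elapsed wall-clock time divided by the duration of the generated audio, and by the number of generated samples. A worked example; the timings below are hypothetical, only the formulas come from the notebooks.

# RTF/TPS arithmetic as in tts(); the numbers are made up for illustration.
sample_rate = 22050      # sample rate from the audio config (LJSpeech-style)
elapsed = 1.8            # hypothetical wall-clock synthesis time in seconds
num_samples = 66150      # hypothetical output length: 3.0 s of audio at 22050 Hz

rtf = elapsed / (num_samples / sample_rate)  # 1.8 / 3.0 = 0.6
tps = elapsed / num_samples                  # ~2.7e-05 s per output sample

print(" > Real-time factor: {}".format(rtf))  # values < 1.0 mean faster than real time
print(" > Time per step: {}".format(tps))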
@@ -0,0 +1,650 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {"colab_type": "text", "id": "yZK6UdwSFnOO"},
      "source": [
        "# **Download and install Coqui TTS**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "yvb0pX3WY6MN"},
      "outputs": [],
      "source": [
        "import os\n",
        "!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "iB9nl2UEG3SY"},
      "outputs": [],
      "source": [
        "!apt-get install espeak\n",
        "os.chdir('TTS')\n",
        "!pip install -r requirements.txt\n",
        "!python setup.py develop\n",
        "os.chdir('..')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {"colab_type": "text", "id": "w6Krn8k1inC_"},
      "source": [
        "**Download Checkpoint**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "PiYHf3lKhi9z"},
      "outputs": [],
      "source": [
        "!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018.zip\n",
        "!unzip ./TTS-checkpoint.zip"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {"colab_type": "text", "id": "MpYNgqrZcJKn"},
      "source": [
        "**Utils Functions**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "4KZA4b_CbMqx"},
      "outputs": [],
      "source": [
        "%load_ext autoreload\n",
        "%autoreload 2\n",
        "import argparse\n",
        "import json\n",
        "# pylint: disable=redefined-outer-name, unused-argument\n",
        "import os\n",
        "import string\n",
        "import time\n",
        "import sys\n",
        "import numpy as np\n",
        "\n",
        "TTS_PATH = \"../content/TTS\"\n",
        "# add libraries into environment\n",
        "sys.path.append(TTS_PATH)  # set this if TTS is not installed globally\n",
        "\n",
        "import torch\n",
        "\n",
        "from TTS.tts.utils.generic_utils import setup_model\n",
        "from TTS.tts.utils.synthesis import synthesis\n",
        "from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n",
        "from TTS.utils.audio import AudioProcessor\n",
        "from TTS.utils.io import load_config\n",
        "from TTS.vocoder.utils.generic_utils import setup_generator\n",
        "\n",
        "\n",
        "def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None):\n",
        "    t_1 = time.time()\n",
        "    waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, None, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n",
        "    if CONFIG.model == \"Tacotron\" and not use_gl:\n",
        "        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
        "    if not use_gl:\n",
        "        waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
        "    if use_cuda and not use_gl:\n",
        "        waveform = waveform.cpu()\n",
        "    if not use_gl:\n",
        "        waveform = waveform.numpy()\n",
        "    waveform = waveform.squeeze()\n",
        "    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
        "    tps = (time.time() - t_1) / len(waveform)\n",
        "    print(\" > Run-time: {}\".format(time.time() - t_1))\n",
        "    print(\" > Real-time factor: {}\".format(rtf))\n",
        "    print(\" > Time per step: {}\".format(tps))\n",
        "    return waveform"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {"colab_type": "text", "id": "ENA2OumIVeMA"},
      "source": [
        "# **Vars definitions**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "jPD0d_XpVXmY"},
      "outputs": [],
      "source": [
        "TEXT = ''\n",
        "OUT_PATH = 'tests-audios/'\n",
        "# create output path\n",
        "os.makedirs(OUT_PATH, exist_ok=True)\n",
        "\n",
        "SPEAKER_FILEID = None  # if None, use the first embedding from speakers.json\n",
        "\n",
        "# model vars\n",
        "MODEL_PATH = 'best_model.pth.tar'\n",
        "CONFIG_PATH = 'config.json'\n",
        "SPEAKER_JSON = 'speakers.json'\n",
        "\n",
        "# vocoder vars\n",
        "VOCODER_PATH = ''\n",
        "VOCODER_CONFIG_PATH = ''\n",
        "\n",
        "USE_CUDA = True"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {"colab_type": "text", "id": "dV6cXXlfi72r"},
      "source": [
        "# **Restore TTS Model**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "x1WgLFauWUPe"},
      "outputs": [],
      "source": [
        "# load the config\n",
        "C = load_config(CONFIG_PATH)\n",
        "C.forward_attn_mask = True\n",
        "\n",
        "# load the audio processor\n",
        "ap = AudioProcessor(**C.audio)\n",
        "\n",
        "# if the vocabulary was passed, replace the default\n",
        "if 'characters' in C.keys():\n",
        "    symbols, phonemes = make_symbols(**C.characters)\n",
        "\n",
        "speaker_embedding = None\n",
        "speaker_embedding_dim = None\n",
        "num_speakers = 0\n",
        "# load speakers\n",
        "if SPEAKER_JSON != '':\n",
        "    speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
        "    num_speakers = len(speaker_mapping)\n",
        "    if C.use_external_speaker_embedding_file:\n",
        "        if SPEAKER_FILEID is not None:\n",
        "            speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']\n",
        "        else:  # if speaker_fileid is not specified, use the first sample in speakers.json\n",
        "            choise_speaker = list(speaker_mapping.keys())[0]\n",
        "            print(\" Speaker: \", choise_speaker.split('_')[0], 'was chosen automatically', \"(this speaker seen in training)\")\n",
        "            speaker_embedding = speaker_mapping[choise_speaker]['embedding']\n",
        "        speaker_embedding_dim = len(speaker_embedding)\n",
        "\n",
        "# load the model\n",
        "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
        "model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n",
        "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n",
        "model.load_state_dict(cp['model'])\n",
        "model.eval()\n",
        "\n",
        "if USE_CUDA:\n",
        "    model.cuda()\n",
        "\n",
        "model.decoder.set_r(cp['r'])\n",
        "\n",
        "# load vocoder model\n",
        "if VOCODER_PATH != \"\":\n",
        "    VC = load_config(VOCODER_CONFIG_PATH)\n",
        "    vocoder_model = setup_generator(VC)\n",
        "    vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n",
        "    vocoder_model.remove_weight_norm()\n",
        "    if USE_CUDA:\n",
        "        vocoder_model.cuda()\n",
        "    vocoder_model.eval()\n",
        "else:\n",
        "    vocoder_model = None\n",
        "    VC = None\n",
        "\n",
        "# fall back to Griffin-Lim when no vocoder checkpoint was given\n",
        "use_griffin_lim = VOCODER_PATH == \"\"\n",
        "\n",
        "if not C.use_external_speaker_embedding_file:\n",
        "    if SPEAKER_FILEID is not None and SPEAKER_FILEID.isdigit():\n",
        "        SPEAKER_FILEID = int(SPEAKER_FILEID)\n",
        "    else:\n",
        "        SPEAKER_FILEID = None\n",
        "else:\n",
        "    SPEAKER_FILEID = None"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {"colab_type": "text", "id": "tNvVEoE30qY6"},
      "source": [
        "Synthesize sentence with Speaker\n",
        "\n",
        "> Stop running the cell to leave!"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "2o8fXkVSyXOa"},
      "outputs": [],
      "source": [
        "import IPython\n",
        "from IPython.display import Audio\n",
        "print(\"Synthesize sentence with Speaker: \", choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n",
        "while True:\n",
        "    TEXT = input(\"Enter sentence: \")\n",
        "    print(\" > Text: {}\".format(TEXT))\n",
        "    wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
        "    IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
        "    # save the results\n",
        "    file_name = TEXT.replace(\" \", \"_\")\n",
        "    file_name = file_name.translate(\n",
        "        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
        "    out_path = os.path.join(OUT_PATH, file_name)\n",
        "    print(\" > Saving output to {}\".format(out_path))\n",
        "    ap.save_wav(wav, out_path)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {"colab_type": "text", "id": "vnV-FigfvsS2"},
      "source": [
        "# **Select Speaker**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "RuCGOnJ_fgDV"},
      "outputs": [],
      "source": [
        "# VCTK speakers not seen in training (new speakers)\n",
        "VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n",
        "\n",
        "# VCTK speakers seen in training\n",
        "VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n",
        "\n",
        "num_samples_speaker = 2  # in theory, the more samples of the speaker, the closer to the real voice"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {"colab_type": "text", "id": "hkvv7gRcx4WV"},
      "source": [
        "## **Example: select a VCTK speaker seen in training**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "BviNMI9UyCYz"},
      "outputs": [],
      "source": [
        "# get embedding\n",
        "Speaker_choise = VCTK_train_Speakers[0]  # choose one of the training speakers\n",
        "# load speakers\n",
        "if SPEAKER_JSON != '':\n",
        "    speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
        "    if C.use_external_speaker_embedding_file:\n",
        "        speaker_embeddings = []\n",
        "        for key in list(speaker_mapping.keys()):\n",
        "            if Speaker_choise in key:\n",
        "                if len(speaker_embeddings) < num_samples_speaker:\n",
        "                    speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
        "        # take the average of the speaker's embedding samples\n",
        "        speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "5e5_XnLsx3jg"},
      "outputs": [],
      "source": [
        "import IPython\n",
        "from IPython.display import Audio\n",
        "print(\"Synthesize sentence with Speaker: \", Speaker_choise.split('_')[0], \"(this speaker seen in training)\")\n",
        "while True:\n",
        "    TEXT = input(\"Enter sentence: \")\n",
        "    print(\" > Text: {}\".format(TEXT))\n",
        "    wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
        "    IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
        "    # save the results\n",
        "    file_name = TEXT.replace(\" \", \"_\")\n",
        "    file_name = file_name.translate(\n",
        "        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
        "    out_path = os.path.join(OUT_PATH, file_name)\n",
        "    print(\" > Saving output to {}\".format(out_path))\n",
        "    ap.save_wav(wav, out_path)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {"colab_type": "text", "id": "QJ6VgT2a4vHW"},
      "source": [
        "## **Example: select a VCTK speaker not seen in training (new speakers)**\n",
        "\n",
        "> Fitting new speakers :)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "SZS57ZK-4vHa"},
      "outputs": [],
      "source": [
        "# get embedding\n",
        "Speaker_choise = VCTK_test_Speakers[0]  # choose one of the test (unseen) speakers\n",
        "# load speakers\n",
        "if SPEAKER_JSON != '':\n",
        "    speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
        "    if C.use_external_speaker_embedding_file:\n",
        "        speaker_embeddings = []\n",
        "        for key in list(speaker_mapping.keys()):\n",
        "            if Speaker_choise in key:\n",
        "                if len(speaker_embeddings) < num_samples_speaker:\n",
        "                    speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
        "        # take the average of the speaker's embedding samples\n",
        "        speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "bbs85vzz4vHo"},
      "outputs": [],
      "source": [
        "import IPython\n",
        "from IPython.display import Audio\n",
        "print(\"Synthesize sentence with Speaker: \", Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
        "while True:\n",
        "    TEXT = input(\"Enter sentence: \")\n",
        "    print(\" > Text: {}\".format(TEXT))\n",
        "    wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
        "    IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
        "    # save the results\n",
        "    file_name = TEXT.replace(\" \", \"_\")\n",
        "    file_name = file_name.translate(\n",
        "        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
        "    out_path = os.path.join(OUT_PATH, file_name)\n",
        "    print(\" > Saving output to {}\".format(out_path))\n",
        "    ap.save_wav(wav, out_path)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {"colab_type": "text", "id": "LEE6mQLh5Who"},
      "source": [
        "# **Example: synthesizing with your own voice :)**"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {"colab_type": "text", "id": "La70gSB65nrs"},
      "source": [
        "Download and load the GE2E Speaker Encoder"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "r0IEFZ0B5vQg"},
      "outputs": [],
      "source": [
        "!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n",
        "!unzip ./SpeakerEncoder-checkpoint.zip"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "jEH8HCTh5mF6"},
      "outputs": [],
      "source": [
        "SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n",
        "SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n",
        "SE_CONFIG_PATH = os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n",
        "USE_CUDA = True"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "tOwkfQqT6-Qo"},
      "outputs": [],
      "source": [
        "from TTS.utils.audio import AudioProcessor\n",
        "from TTS.speaker_encoder.model import SpeakerEncoder\n",
        "se_config = load_config(SE_CONFIG_PATH)\n",
        "se_ap = AudioProcessor(**se_config['audio'])\n",
        "\n",
        "se_model = SpeakerEncoder(**se_config.model)\n",
        "se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n",
        "se_model.eval()\n",
        "if USE_CUDA:\n",
        "    se_model.cuda()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {"colab_type": "text", "id": "0TLlbUFG8O36"},
      "source": [
        "Upload a wav audio file in your voice.\n",
        "\n",
        "> We recommend files longer than 3 seconds; the longer the file, the closer to your voice :)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "_FWwHPjJ8NXl"},
      "outputs": [],
      "source": [
        "from google.colab import files\n",
        "file_list = files.upload()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "WWOf6sgbBbGY"},
      "outputs": [],
      "source": [
        "# extract an embedding from each uploaded wav file\n",
        "speaker_embeddings = []\n",
        "for name in file_list.keys():\n",
        "    if '.wav' in name:\n",
        "        mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n",
        "        mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
        "        if USE_CUDA:\n",
        "            mel_spec = mel_spec.cuda()\n",
        "        embedd = se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
        "        speaker_embeddings.append(embedd)\n",
        "    else:\n",
        "        print(\" Only .wav files are supported; skipping {}\".format(name))\n",
        "\n",
        "# take the average of the embedding samples\n",
        "speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {"colab": {}, "colab_type": "code", "id": "xmItcGac5WiG"},
      "outputs": [],
      "source": [
        "import IPython\n",
        "from IPython.display import Audio\n",
        "print(\"Synthesize sentence with New Speaker using files: \", file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n",
        "while True:\n",
        "    TEXT = input(\"Enter sentence: \")\n",
        "    print(\" > Text: {}\".format(TEXT))\n",
        "    wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
|
||||
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
|
||||
" # save the results\n",
|
||||
" file_name = TEXT.replace(\" \", \"_\")\n",
|
||||
" file_name = file_name.translate(\n",
|
||||
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
|
||||
" out_path = os.path.join(OUT_PATH, file_name)\n",
|
||||
" print(\" > Saving output to {}\".format(out_path))\n",
|
||||
" ap.save_wav(wav, out_path)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"accelerator": "GPU",
|
||||
"colab": {
|
||||
"collapsed_sections": [
|
||||
"vnV-FigfvsS2",
|
||||
"hkvv7gRcx4WV",
|
||||
"QJ6VgT2a4vHW"
|
||||
],
|
||||
"name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018.ipynb",
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
|
@@ -0,0 +1,847 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "yZK6UdwSFnOO"
|
||||
},
|
||||
"source": [
|
||||
"# **Download and install Coqui TTS**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "yvb0pX3WY6MN"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os \n",
|
||||
"!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "iB9nl2UEG3SY"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!apt-get install espeak\n",
|
||||
"os.chdir('TTS')\n",
|
||||
"!pip install -r requirements.txt\n",
|
||||
"!python setup.py develop\n",
|
||||
"os.chdir('..')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "w6Krn8k1inC_"
|
||||
},
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"**Download Checkpoint**\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "PiYHf3lKhi9z"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018-with-GST.zip\n",
|
||||
"!unzip ./TTS-checkpoint.zip\n",
|
||||
"\n",
|
||||
"# Download gst style example\n",
|
||||
"!wget https://github.com/Edresson/TTS/releases/download/v1.0.0/gst-style-example.wav"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "MpYNgqrZcJKn"
|
||||
},
|
||||
"source": [
|
||||
"**Utils Functions**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "4KZA4b_CbMqx"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%load_ext autoreload\n",
|
||||
"%autoreload 2\n",
|
||||
"import argparse\n",
|
||||
"import json\n",
|
||||
"# pylint: disable=redefined-outer-name, unused-argument\n",
|
||||
"import os\n",
|
||||
"import string\n",
|
||||
"import time\n",
|
||||
"import sys\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"TTS_PATH = \"../content/TTS\"\n",
|
||||
"# add libraries into environment\n",
|
||||
"sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"from TTS.tts.utils.generic_utils import setup_model\n",
|
||||
"from TTS.tts.utils.synthesis import synthesis\n",
|
||||
"from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n",
|
||||
"from TTS.utils.audio import AudioProcessor\n",
|
||||
"from TTS.utils.io import load_config\n",
|
||||
"from TTS.vocoder.utils.generic_utils import setup_generator\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None):\n",
|
||||
" t_1 = time.time()\n",
|
||||
" waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n",
|
||||
" if CONFIG.model == \"Tacotron\" and not use_gl:\n",
|
||||
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
|
||||
" if not use_gl:\n",
|
||||
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
|
||||
" if use_cuda and not use_gl:\n",
|
||||
" waveform = waveform.cpu()\n",
|
||||
" if not use_gl:\n",
|
||||
" waveform = waveform.numpy()\n",
|
||||
" waveform = waveform.squeeze()\n",
|
||||
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
|
||||
" tps = (time.time() - t_1) / len(waveform)\n",
|
||||
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
|
||||
" print(\" > Real-time factor: {}\".format(rtf))\n",
|
||||
" print(\" > Time per step: {}\".format(tps))\n",
|
||||
" return waveform\n",
|
||||
"\n"
|
||||
]
|
||||
},
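The `tts()` helper above reports a real-time factor (RTF): wall-clock synthesis time divided by the duration of the generated audio, so RTF < 1 means faster-than-real-time synthesis. A small sketch of the same computation (the function name is illustrative):

```python
def real_time_factor(elapsed_seconds, num_samples, sample_rate):
    # duration of the generated audio in seconds
    audio_seconds = num_samples / sample_rate
    return elapsed_seconds / audio_seconds

# e.g. 2.0 s to synthesize 4.0 s of 22050 Hz audio gives RTF = 0.5
assert real_time_factor(2.0, 4 * 22050, 22050) == 0.5
```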
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "ENA2OumIVeMA"
|
||||
},
|
||||
"source": [
|
||||
"# **Vars definitions**\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "jPD0d_XpVXmY"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"TEXT = ''\n",
|
||||
"OUT_PATH = 'tests-audios/'\n",
|
||||
"# create output path\n",
|
||||
"os.makedirs(OUT_PATH, exist_ok=True)\n",
|
||||
"\n",
|
||||
"SPEAKER_FILEID = None # if None use the first embedding from speakers.json\n",
|
||||
"\n",
|
||||
"# model vars \n",
|
||||
"MODEL_PATH = 'best_model.pth.tar'\n",
|
||||
"CONFIG_PATH = 'config.json'\n",
|
||||
"SPEAKER_JSON = 'speakers.json'\n",
|
||||
"\n",
|
||||
"# vocoder vars\n",
|
||||
"VOCODER_PATH = ''\n",
|
||||
"VOCODER_CONFIG_PATH = ''\n",
|
||||
"\n",
|
||||
"USE_CUDA = True"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "dV6cXXlfi72r"
|
||||
},
|
||||
"source": [
|
||||
"# **Restore TTS Model**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "x1WgLFauWUPe"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# load the config\n",
|
||||
"C = load_config(CONFIG_PATH)\n",
|
||||
"C.forward_attn_mask = True\n",
|
||||
"\n",
|
||||
"# load the audio processor\n",
|
||||
"ap = AudioProcessor(**C.audio)\n",
|
||||
"\n",
|
||||
"# if the vocabulary was passed, replace the default\n",
|
||||
"if 'characters' in C.keys():\n",
|
||||
" symbols, phonemes = make_symbols(**C.characters)\n",
|
||||
"\n",
|
||||
"speaker_embedding = None\n",
|
||||
"speaker_embedding_dim = None\n",
|
||||
"num_speakers = 0\n",
|
||||
"# load speakers\n",
|
||||
"if SPEAKER_JSON != '':\n",
|
||||
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
|
||||
" num_speakers = len(speaker_mapping)\n",
|
||||
" if C.use_external_speaker_embedding_file:\n",
|
||||
" if SPEAKER_FILEID is not None:\n",
|
||||
" speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']\n",
|
||||
" else: # if speaker_fileid is not specificated use the first sample in speakers.json\n",
|
||||
" choise_speaker = list(speaker_mapping.keys())[0]\n",
|
||||
" print(\" Speaker: \",choise_speaker.split('_')[0],'was chosen automatically', \"(this speaker seen in training)\")\n",
|
||||
" speaker_embedding = speaker_mapping[choise_speaker]['embedding']\n",
|
||||
" speaker_embedding_dim = len(speaker_embedding)\n",
|
||||
"\n",
|
||||
"# load the model\n",
|
||||
"num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
|
||||
"model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n",
|
||||
"cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n",
|
||||
"model.load_state_dict(cp['model'])\n",
|
||||
"model.eval()\n",
|
||||
"\n",
|
||||
"if USE_CUDA:\n",
|
||||
" model.cuda()\n",
|
||||
"\n",
|
||||
"model.decoder.set_r(cp['r'])\n",
|
||||
"\n",
|
||||
"# load vocoder model\n",
|
||||
"if VOCODER_PATH!= \"\":\n",
|
||||
" VC = load_config(VOCODER_CONFIG_PATH)\n",
|
||||
" vocoder_model = setup_generator(VC)\n",
|
||||
" vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n",
|
||||
" vocoder_model.remove_weight_norm()\n",
|
||||
" if USE_CUDA:\n",
|
||||
" vocoder_model.cuda()\n",
|
||||
" vocoder_model.eval()\n",
|
||||
"else:\n",
|
||||
" vocoder_model = None\n",
|
||||
" VC = None\n",
|
||||
"\n",
|
||||
"# synthesize voice\n",
|
||||
"use_griffin_lim = VOCODER_PATH== \"\"\n",
|
||||
"\n",
|
||||
"if not C.use_external_speaker_embedding_file:\n",
|
||||
" if SPEAKER_FILEID.isdigit():\n",
|
||||
" SPEAKER_FILEID = int(SPEAKER_FILEID)\n",
|
||||
" else:\n",
|
||||
" SPEAKER_FILEID = None\n",
|
||||
"else:\n",
|
||||
" SPEAKER_FILEID = None\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "tNvVEoE30qY6"
|
||||
},
|
||||
"source": [
|
||||
"Synthesize sentence with Speaker\n",
|
||||
"\n",
|
||||
"> Stop running the cell to leave!\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "2o8fXkVSyXOa"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"from IPython.display import Audio\n",
|
||||
"print(\"Synthesize sentence with Speaker: \",choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n",
|
||||
"gst_style = 'gst-style-example.wav'\n",
|
||||
"while True:\n",
|
||||
" TEXT = input(\"Enter sentence: \")\n",
|
||||
" print(\" > Text: {}\".format(TEXT))\n",
|
||||
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
|
||||
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
|
||||
" # save the results\n",
|
||||
" file_name = TEXT.replace(\" \", \"_\")\n",
|
||||
" file_name = file_name.translate(\n",
|
||||
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
|
||||
" out_path = os.path.join(OUT_PATH, file_name)\n",
|
||||
" print(\" > Saving output to {}\".format(out_path))\n",
|
||||
" ap.save_wav(wav, out_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "vnV-FigfvsS2"
|
||||
},
|
||||
"source": [
|
||||
"# **Select Speaker**\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "RuCGOnJ_fgDV"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"# VCTK speakers not seen in training (new speakers)\n",
|
||||
"VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n",
|
||||
"\n",
|
||||
"# VCTK speakers seen in training\n",
|
||||
"VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"num_samples_speaker = 2 # In theory the more samples of the speaker the more similar to the real voice it will be!\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "hkvv7gRcx4WV"
|
||||
},
|
||||
"source": [
|
||||
"## **Example select a VCTK seen speaker in training**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "BviNMI9UyCYz"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# get embedding\n",
|
||||
"Speaker_choise = VCTK_train_Speakers[0] # choise one of training speakers\n",
|
||||
"# load speakers\n",
|
||||
"if SPEAKER_JSON != '':\n",
|
||||
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
|
||||
" if C.use_external_speaker_embedding_file:\n",
|
||||
" speaker_embeddings = []\n",
|
||||
" for key in list(speaker_mapping.keys()):\n",
|
||||
" if Speaker_choise in key:\n",
|
||||
" if len(speaker_embeddings) < num_samples_speaker:\n",
|
||||
" speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
|
||||
" # takes the average of the embedings samples of the announcers\n",
|
||||
" speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "5e5_XnLsx3jg"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"from IPython.display import Audio\n",
|
||||
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker seen in training)\")\n",
|
||||
"gst_style = 'gst-style-example.wav'\n",
|
||||
"while True:\n",
|
||||
" TEXT = input(\"Enter sentence: \")\n",
|
||||
" print(\" > Text: {}\".format(TEXT))\n",
|
||||
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
|
||||
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
|
||||
" # save the results\n",
|
||||
" file_name = TEXT.replace(\" \", \"_\")\n",
|
||||
" file_name = file_name.translate(\n",
|
||||
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
|
||||
" out_path = os.path.join(OUT_PATH, file_name)\n",
|
||||
" print(\" > Saving output to {}\".format(out_path))\n",
|
||||
" ap.save_wav(wav, out_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "QJ6VgT2a4vHW"
|
||||
},
|
||||
"source": [
|
||||
"## **Example select a VCTK not seen speaker in training (new Speakers)**\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"> Fitting new Speakers :)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "SZS57ZK-4vHa"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# get embedding\n",
|
||||
"Speaker_choise = VCTK_test_Speakers[0] # choise one of training speakers\n",
|
||||
"# load speakers\n",
|
||||
"if SPEAKER_JSON != '':\n",
|
||||
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
|
||||
" if C.use_external_speaker_embedding_file:\n",
|
||||
" speaker_embeddings = []\n",
|
||||
" for key in list(speaker_mapping.keys()):\n",
|
||||
" if Speaker_choise in key:\n",
|
||||
" if len(speaker_embeddings) < num_samples_speaker:\n",
|
||||
" speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
|
||||
" # takes the average of the embedings samples of the announcers\n",
|
||||
" speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "bbs85vzz4vHo"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"from IPython.display import Audio\n",
|
||||
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
|
||||
"gst_style = 'gst-style-example.wav'\n",
|
||||
"while True:\n",
|
||||
" TEXT = input(\"Enter sentence: \")\n",
|
||||
" print(\" > Text: {}\".format(TEXT))\n",
|
||||
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
|
||||
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
|
||||
" # save the results\n",
|
||||
" file_name = TEXT.replace(\" \", \"_\")\n",
|
||||
" file_name = file_name.translate(\n",
|
||||
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
|
||||
" out_path = os.path.join(OUT_PATH, file_name)\n",
|
||||
" print(\" > Saving output to {}\".format(out_path))\n",
|
||||
" ap.save_wav(wav, out_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "g_G_HweN04W-"
|
||||
},
|
||||
"source": [
|
||||
"# **Changing GST tokens manually (without wav reference)**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "jyFP5syW2bjt"
|
||||
},
|
||||
"source": [
|
||||
"You can define tokens manually, this way you can increase/decrease the function of a given GST token. For example a token is responsible for the length of the speaker's pauses, if you increase the value of that token you will have longer pauses and if you decrease it you will have shorter pauses."
|
||||
]
|
||||
},
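The cells below vary one token at a time. To hear the effect of a single token more systematically, one could sweep its weight while holding the others fixed. This sketch reuses the notebook's `model`, `vocoder_model`, `C`, `ap`, and related variables, so run it after the cells above; the weight grid is illustrative:

```python
# sweep token "0" while keeping the other tokens at zero
for weight in (-0.9, -0.3, 0.0, 0.3, 0.9):
    gst_style = {"0": weight, "1": 0, "3": 0, "4": 0}
    wav = tts(model, vocoder_model, "The same sentence every time.", C,
              USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID,
              speaker_embedding=speaker_embedding, gst_style=gst_style)
    IPython.display.display(Audio(wav, rate=ap.sample_rate))
```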
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "SpwjDjCM2a3Y"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# set gst tokens, in this model we have 5 tokens\n",
|
||||
"gst_style = {\"0\": 0, \"1\": 0, \"3\": 0, \"4\": 0}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "qWChMbI_0z5X"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"from IPython.display import Audio\n",
|
||||
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
|
||||
"TEXT = input(\"Enter sentence: \")\n",
|
||||
"print(\" > Text: {}\".format(TEXT))\n",
|
||||
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
|
||||
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
|
||||
"# save the results\n",
|
||||
"file_name = TEXT.replace(\" \", \"_\")\n",
|
||||
"file_name = file_name.translate(\n",
|
||||
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
|
||||
"out_path = os.path.join(OUT_PATH, file_name)\n",
|
||||
"print(\" > Saving output to {}\".format(out_path))\n",
|
||||
"ap.save_wav(wav, out_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "uFjUi9xQ3mG3"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gst_style = {\"0\": 0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n",
|
||||
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
|
||||
"TEXT = input(\"Enter sentence: \")\n",
|
||||
"print(\" > Text: {}\".format(TEXT))\n",
|
||||
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
|
||||
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
|
||||
"# save the results\n",
|
||||
"file_name = TEXT.replace(\" \", \"_\")\n",
|
||||
"file_name = file_name.translate(\n",
|
||||
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
|
||||
"out_path = os.path.join(OUT_PATH, file_name)\n",
|
||||
"print(\" > Saving output to {}\".format(out_path))\n",
|
||||
"ap.save_wav(wav, out_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "Uw0d6gWg4L27"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gst_style = {\"0\": -0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n",
|
||||
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
|
||||
"TEXT = input(\"Enter sentence: \")\n",
|
||||
"print(\" > Text: {}\".format(TEXT))\n",
|
||||
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
|
||||
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
|
||||
"# save the results\n",
|
||||
"file_name = TEXT.replace(\" \", \"_\")\n",
|
||||
"file_name = file_name.translate(\n",
|
||||
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
|
||||
"out_path = os.path.join(OUT_PATH, file_name)\n",
|
||||
"print(\" > Saving output to {}\".format(out_path))\n",
|
||||
"ap.save_wav(wav, out_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "V9izw4-54-Tl"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gst_style = {\"0\": 0, \"1\": 0.9, \"3\": 0, \"4\": 0}\n",
|
||||
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
|
||||
"TEXT = input(\"Enter sentence: \")\n",
|
||||
"print(\" > Text: {}\".format(TEXT))\n",
|
||||
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
|
||||
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
|
||||
"# save the results\n",
|
||||
"file_name = TEXT.replace(\" \", \"_\")\n",
|
||||
"file_name = file_name.translate(\n",
|
||||
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
|
||||
"out_path = os.path.join(OUT_PATH, file_name)\n",
|
||||
"print(\" > Saving output to {}\".format(out_path))\n",
|
||||
"ap.save_wav(wav, out_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "LEE6mQLh5Who"
|
||||
},
|
||||
"source": [
|
||||
"# **Example Synthesizing with your own voice :)**\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "La70gSB65nrs"
|
||||
},
|
||||
"source": [
|
||||
" Download and load GE2E Speaker Encoder "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "r0IEFZ0B5vQg"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n",
|
||||
"!unzip ./SpeakerEncoder-checkpoint.zip"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "jEH8HCTh5mF6"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n",
|
||||
"SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n",
|
||||
"SE_CONFIG_PATH =os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n",
|
||||
"USE_CUDA = True"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "tOwkfQqT6-Qo"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from TTS.utils.audio import AudioProcessor\n",
|
||||
"from TTS.speaker_encoder.model import SpeakerEncoder\n",
|
||||
"se_config = load_config(SE_CONFIG_PATH)\n",
|
||||
"se_ap = AudioProcessor(**se_config['audio'])\n",
|
||||
"\n",
|
||||
"se_model = SpeakerEncoder(**se_config.model)\n",
|
||||
"se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n",
|
||||
"se_model.eval()\n",
|
||||
"if USE_CUDA:\n",
|
||||
" se_model.cuda()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "0TLlbUFG8O36"
|
||||
},
|
||||
"source": [
|
||||
"Upload one or more wav audio files in your voice.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"> We recommend files longer than 3 seconds, the bigger the file the closer to your voice :)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "_FWwHPjJ8NXl"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# select one or more wav files\n",
|
||||
"from google.colab import files\n",
|
||||
"file_list = files.upload()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "WWOf6sgbBbGY"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# extract embedding from wav files\n",
|
||||
"speaker_embeddings = []\n",
|
||||
"for name in file_list.keys():\n",
|
||||
" if '.wav' in name:\n",
|
||||
" mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n",
|
||||
" mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
|
||||
" if USE_CUDA:\n",
|
||||
" mel_spec = mel_spec.cuda()\n",
|
||||
" embedd = se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
|
||||
" speaker_embeddings.append(embedd)\n",
|
||||
" else:\n",
|
||||
" print(\"You need upload Wav files, others files is not supported !!\")\n",
|
||||
"\n",
|
||||
"# takes the average of the embedings samples of the announcers\n",
|
||||
"speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()"
|
||||
]
|
||||
},
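The loop above accepts any file whose name merely contains `.wav`. A slightly stricter variant (a sketch, assuming the same `file_list` from the upload cell) filters case-insensitively on the suffix before extracting embeddings:

```python
# keep only files that actually end in .wav (case-insensitive)
wav_files = [name for name in file_list.keys() if name.lower().endswith('.wav')]
if not wav_files:
    print("Please upload WAV files; other formats are not supported!")
```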
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "AQ7eP31d9yzq"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"from IPython.display import Audio\n",
|
||||
"print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n",
|
||||
"gst_style = {\"0\": 0, \"1\": 0.0, \"3\": 0, \"4\": 0}\n",
|
||||
"gst_style = 'gst-style-example.wav'\n",
|
||||
"TEXT = input(\"Enter sentence: \")\n",
|
||||
"print(\" > Text: {}\".format(TEXT))\n",
|
||||
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
|
||||
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
|
||||
"# save the results\n",
|
||||
"file_name = TEXT.replace(\" \", \"_\")\n",
|
||||
"file_name = file_name.translate(\n",
|
||||
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
|
||||
"out_path = os.path.join(OUT_PATH, file_name)\n",
|
||||
"print(\" > Saving output to {}\".format(out_path))\n",
|
||||
"ap.save_wav(wav, out_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "11i10yE1-LMJ"
|
||||
},
|
||||
"source": [
|
||||
"Uploading your own GST reference wav file"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "eKohSQG1-KkT"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# select one wav file for GST reference\n",
|
||||
"from google.colab import files\n",
|
||||
"file_list = files.upload()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "xmItcGac5WiG"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n",
|
||||
"gst_style = list(file_list.keys())[0]\n",
|
||||
"TEXT = input(\"Enter sentence: \")\n",
|
||||
"print(\" > Text: {}\".format(TEXT))\n",
|
||||
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
|
||||
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
|
||||
"# save the results\n",
|
||||
"file_name = TEXT.replace(\" \", \"_\")\n",
|
||||
"file_name = file_name.translate(\n",
|
||||
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
|
||||
"out_path = os.path.join(OUT_PATH, file_name)\n",
|
||||
"print(\" > Saving output to {}\".format(out_path))\n",
|
||||
"ap.save_wav(wav, out_path)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"accelerator": "GPU",
|
||||
"colab": {
|
||||
"collapsed_sections": [
|
||||
"yZK6UdwSFnOO",
|
||||
"ENA2OumIVeMA",
|
||||
"dV6cXXlfi72r",
|
||||
"vnV-FigfvsS2",
|
||||
"g_G_HweN04W-",
|
||||
"LEE6mQLh5Who"
|
||||
],
|
||||
"name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018-With-GST.ipynb",
|
||||
"provenance": [],
|
||||
"toc_visible": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
|
@@ -1,637 +0,0 @@
|
|||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018.ipynb",
|
||||
"provenance": [],
|
||||
"collapsed_sections": [
|
||||
"vnV-FigfvsS2",
|
||||
"hkvv7gRcx4WV",
|
||||
"QJ6VgT2a4vHW"
|
||||
]
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"accelerator": "GPU"
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "yZK6UdwSFnOO",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"# **Download and install Mozilla TTS**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "yvb0pX3WY6MN",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"import os \n",
|
||||
"!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "iB9nl2UEG3SY",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"!apt-get install espeak\n",
|
||||
"os.chdir('TTS')\n",
|
||||
"!pip install -r requirements.txt\n",
|
||||
"!python setup.py develop\n",
|
||||
"os.chdir('..')"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "w6Krn8k1inC_",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"**Download Checkpoint**\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "PiYHf3lKhi9z",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018.zip\n",
|
||||
"!unzip ./TTS-checkpoint.zip\n"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "MpYNgqrZcJKn",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"**Utils Functions**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "4KZA4b_CbMqx",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"%load_ext autoreload\n",
|
||||
"%autoreload 2\n",
|
||||
"import argparse\n",
|
||||
"import json\n",
|
||||
"# pylint: disable=redefined-outer-name, unused-argument\n",
|
||||
"import os\n",
|
||||
"import string\n",
|
||||
"import time\n",
|
||||
"import sys\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"TTS_PATH = \"../content/TTS\"\n",
|
||||
"# add libraries into environment\n",
|
||||
"sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"from TTS.tts.utils.generic_utils import setup_model\n",
|
||||
"from TTS.tts.utils.synthesis import synthesis\n",
|
||||
"from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n",
|
||||
"from TTS.utils.audio import AudioProcessor\n",
|
||||
"from TTS.utils.io import load_config\n",
|
||||
"from TTS.vocoder.utils.generic_utils import setup_generator\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None):\n",
|
||||
" t_1 = time.time()\n",
|
||||
" waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, None, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n",
|
||||
" if CONFIG.model == \"Tacotron\" and not use_gl:\n",
|
||||
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
|
||||
" if not use_gl:\n",
|
||||
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
|
||||
" if use_cuda and not use_gl:\n",
|
||||
" waveform = waveform.cpu()\n",
|
||||
" if not use_gl:\n",
|
||||
" waveform = waveform.numpy()\n",
|
||||
" waveform = waveform.squeeze()\n",
|
||||
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
|
||||
" tps = (time.time() - t_1) / len(waveform)\n",
|
||||
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
|
||||
" print(\" > Real-time factor: {}\".format(rtf))\n",
|
||||
" print(\" > Time per step: {}\".format(tps))\n",
|
||||
" return waveform\n",
|
||||
"\n"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "ENA2OumIVeMA",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"# **Vars definitions**\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "jPD0d_XpVXmY",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"TEXT = ''\n",
|
||||
"OUT_PATH = 'tests-audios/'\n",
|
||||
"# create output path\n",
|
||||
"os.makedirs(OUT_PATH, exist_ok=True)\n",
|
||||
"\n",
|
||||
"SPEAKER_FILEID = None # if None use the first embedding from speakers.json\n",
|
||||
"\n",
|
||||
"# model vars \n",
|
||||
"MODEL_PATH = 'best_model.pth.tar'\n",
|
||||
"CONFIG_PATH = 'config.json'\n",
|
||||
"SPEAKER_JSON = 'speakers.json'\n",
|
||||
"\n",
|
||||
"# vocoder vars\n",
|
||||
"VOCODER_PATH = ''\n",
|
||||
"VOCODER_CONFIG_PATH = ''\n",
|
||||
"\n",
|
||||
"USE_CUDA = True"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "dV6cXXlfi72r",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"# **Restore TTS Model**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "x1WgLFauWUPe",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"# load the config\n",
|
||||
"C = load_config(CONFIG_PATH)\n",
|
||||
"C.forward_attn_mask = True\n",
|
||||
"\n",
|
||||
"# load the audio processor\n",
|
||||
"ap = AudioProcessor(**C.audio)\n",
|
||||
"\n",
|
||||
"# if the vocabulary was passed, replace the default\n",
|
||||
"if 'characters' in C.keys():\n",
|
||||
" symbols, phonemes = make_symbols(**C.characters)\n",
|
||||
"\n",
|
||||
"speaker_embedding = None\n",
|
||||
"speaker_embedding_dim = None\n",
|
||||
"num_speakers = 0\n",
|
||||
"# load speakers\n",
|
||||
"if SPEAKER_JSON != '':\n",
|
||||
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
|
||||
" num_speakers = len(speaker_mapping)\n",
|
||||
" if C.use_external_speaker_embedding_file:\n",
|
||||
" if SPEAKER_FILEID is not None:\n",
|
||||
" speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']\n",
|
||||
" else: # if speaker_fileid is not specificated use the first sample in speakers.json\n",
|
||||
" choise_speaker = list(speaker_mapping.keys())[0]\n",
|
||||
" print(\" Speaker: \",choise_speaker.split('_')[0],'was chosen automatically', \"(this speaker seen in training)\")\n",
|
||||
" speaker_embedding = speaker_mapping[choise_speaker]['embedding']\n",
|
||||
" speaker_embedding_dim = len(speaker_embedding)\n",
|
||||
"\n",
|
||||
"# load the model\n",
|
||||
"num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
|
||||
"model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n",
|
||||
"cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n",
|
||||
"model.load_state_dict(cp['model'])\n",
|
||||
"model.eval()\n",
|
||||
"\n",
|
||||
"if USE_CUDA:\n",
|
||||
" model.cuda()\n",
|
||||
"\n",
|
||||
"model.decoder.set_r(cp['r'])\n",
|
||||
"\n",
|
||||
"# load vocoder model\n",
|
||||
"if VOCODER_PATH!= \"\":\n",
|
||||
" VC = load_config(VOCODER_CONFIG_PATH)\n",
|
||||
" vocoder_model = setup_generator(VC)\n",
|
||||
" vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n",
|
||||
" vocoder_model.remove_weight_norm()\n",
|
||||
" if USE_CUDA:\n",
|
||||
" vocoder_model.cuda()\n",
|
||||
" vocoder_model.eval()\n",
|
||||
"else:\n",
|
||||
" vocoder_model = None\n",
|
||||
" VC = None\n",
|
||||
"\n",
|
||||
"# synthesize voice\n",
|
||||
"use_griffin_lim = VOCODER_PATH== \"\"\n",
|
||||
"\n",
|
||||
"if not C.use_external_speaker_embedding_file:\n",
|
||||
" if SPEAKER_FILEID.isdigit():\n",
|
||||
" SPEAKER_FILEID = int(SPEAKER_FILEID)\n",
|
||||
" else:\n",
|
||||
" SPEAKER_FILEID = None\n",
|
||||
"else:\n",
|
||||
" SPEAKER_FILEID = None\n"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "tNvVEoE30qY6",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"Synthesize sentence with Speaker\n",
|
||||
"\n",
|
||||
"> Stop running the cell to leave!\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "2o8fXkVSyXOa",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"from IPython.display import Audio\n",
|
||||
"print(\"Synthesize sentence with Speaker: \",choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n",
|
||||
"while True:\n",
|
||||
" TEXT = input(\"Enter sentence: \")\n",
|
||||
" print(\" > Text: {}\".format(TEXT))\n",
|
||||
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
|
||||
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
|
||||
" # save the results\n",
|
||||
" file_name = TEXT.replace(\" \", \"_\")\n",
|
||||
" file_name = file_name.translate(\n",
|
||||
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
|
||||
" out_path = os.path.join(OUT_PATH, file_name)\n",
|
||||
" print(\" > Saving output to {}\".format(out_path))\n",
|
||||
" ap.save_wav(wav, out_path)"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "vnV-FigfvsS2",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"# **Select Speaker**\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "RuCGOnJ_fgDV",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"\n",
|
||||
"# VCTK speakers not seen in training (new speakers)\n",
|
||||
"VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n",
|
||||
"\n",
|
||||
"# VCTK speakers seen in training\n",
|
||||
"VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"num_samples_speaker = 2 # In theory the more samples of the speaker the more similar to the real voice it will be!\n"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "hkvv7gRcx4WV",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"## **Example select a VCTK seen speaker in training**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "BviNMI9UyCYz",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"# get embedding\n",
|
||||
"Speaker_choise = VCTK_train_Speakers[0] # choise one of training speakers\n",
|
||||
"# load speakers\n",
|
||||
"if SPEAKER_JSON != '':\n",
|
||||
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
|
||||
" if C.use_external_speaker_embedding_file:\n",
|
||||
" speaker_embeddings = []\n",
|
||||
" for key in list(speaker_mapping.keys()):\n",
|
||||
" if Speaker_choise in key:\n",
|
||||
" if len(speaker_embeddings) < num_samples_speaker:\n",
|
||||
" speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
|
||||
" # takes the average of the embedings samples of the announcers\n",
|
||||
" speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n",
|
||||
" "
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "5e5_XnLsx3jg",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"from IPython.display import Audio\n",
|
||||
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker seen in training)\")\n",
|
||||
"while True:\n",
|
||||
" TEXT = input(\"Enter sentence: \")\n",
|
||||
" print(\" > Text: {}\".format(TEXT))\n",
|
||||
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
|
||||
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
|
||||
" # save the results\n",
|
||||
" file_name = TEXT.replace(\" \", \"_\")\n",
|
||||
" file_name = file_name.translate(\n",
|
||||
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
|
||||
" out_path = os.path.join(OUT_PATH, file_name)\n",
|
||||
" print(\" > Saving output to {}\".format(out_path))\n",
|
||||
" ap.save_wav(wav, out_path)"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "QJ6VgT2a4vHW"
|
||||
},
|
||||
"source": [
|
||||
"## **Example select a VCTK not seen speaker in training (new Speakers)**\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"> Fitting new Speakers :)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"colab_type": "code",
|
||||
"id": "SZS57ZK-4vHa",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"# get embedding\n",
|
||||
"Speaker_choise = VCTK_test_Speakers[0] # choise one of training speakers\n",
|
||||
"# load speakers\n",
|
||||
"if SPEAKER_JSON != '':\n",
|
||||
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
|
||||
" if C.use_external_speaker_embedding_file:\n",
|
||||
" speaker_embeddings = []\n",
|
||||
" for key in list(speaker_mapping.keys()):\n",
|
||||
" if Speaker_choise in key:\n",
|
||||
" if len(speaker_embeddings) < num_samples_speaker:\n",
|
||||
" speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
|
||||
" # takes the average of the embedings samples of the announcers\n",
|
||||
" speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n",
|
||||
" "
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"colab_type": "code",
|
||||
"id": "bbs85vzz4vHo",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"from IPython.display import Audio\n",
|
||||
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
|
||||
"while True:\n",
|
||||
" TEXT = input(\"Enter sentence: \")\n",
|
||||
" print(\" > Text: {}\".format(TEXT))\n",
|
||||
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
|
||||
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
|
||||
" # save the results\n",
|
||||
" file_name = TEXT.replace(\" \", \"_\")\n",
|
||||
" file_name = file_name.translate(\n",
|
||||
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
|
||||
" out_path = os.path.join(OUT_PATH, file_name)\n",
|
||||
" print(\" > Saving output to {}\".format(out_path))\n",
|
||||
" ap.save_wav(wav, out_path)"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"colab_type": "text",
|
||||
"id": "LEE6mQLh5Who"
|
||||
},
|
||||
"source": [
|
||||
"# **Example Synthesizing with your own voice :)**\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "La70gSB65nrs",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
" Download and load GE2E Speaker Encoder "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "r0IEFZ0B5vQg",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n",
|
||||
"!unzip ./SpeakerEncoder-checkpoint.zip"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "jEH8HCTh5mF6",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n",
|
||||
"SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n",
|
||||
"SE_CONFIG_PATH =os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n",
|
||||
"USE_CUDA = True"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "tOwkfQqT6-Qo",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"from TTS.utils.audio import AudioProcessor\n",
|
||||
"from TTS.speaker_encoder.model import SpeakerEncoder\n",
|
||||
"se_config = load_config(SE_CONFIG_PATH)\n",
|
||||
"se_ap = AudioProcessor(**se_config['audio'])\n",
|
||||
"\n",
|
||||
"se_model = SpeakerEncoder(**se_config.model)\n",
|
||||
"se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n",
|
||||
"se_model.eval()\n",
|
||||
"if USE_CUDA:\n",
|
||||
" se_model.cuda()"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "0TLlbUFG8O36",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"Upload a wav audio file in your voice.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"> We recommend files longer than 3 seconds, the bigger the file the closer to your voice :)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "_FWwHPjJ8NXl",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"from google.colab import files\n",
|
||||
"file_list = files.upload()"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "WWOf6sgbBbGY",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"# extract embedding from wav files\n",
|
||||
"speaker_embeddings = []\n",
|
||||
"for name in file_list.keys():\n",
|
||||
" if '.wav' in name:\n",
|
||||
" mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n",
|
||||
" mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
|
||||
" if USE_CUDA:\n",
|
||||
" mel_spec = mel_spec.cuda()\n",
|
||||
" embedd = se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
|
||||
" speaker_embeddings.append(embedd)\n",
|
||||
" else:\n",
|
||||
" print(\" You need upload Wav files, others files is not supported !!\")\n",
|
||||
"\n",
|
||||
"# takes the average of the embedings samples of the announcers\n",
|
||||
"speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"colab_type": "code",
|
||||
"id": "xmItcGac5WiG",
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"from IPython.display import Audio\n",
|
||||
"print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n",
|
||||
"while True:\n",
|
||||
" TEXT = input(\"Enter sentence: \")\n",
|
||||
" print(\" > Text: {}\".format(TEXT))\n",
|
||||
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n",
|
||||
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
|
||||
" # save the results\n",
|
||||
" file_name = TEXT.replace(\" \", \"_\")\n",
|
||||
" file_name = file_name.translate(\n",
|
||||
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
|
||||
" out_path = os.path.join(OUT_PATH, file_name)\n",
|
||||
" print(\" > Saving output to {}\".format(out_path))\n",
|
||||
" ap.save_wav(wav, out_path)"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
|
||||
]
|
||||
}
|
|
@@ -1,834 +0,0 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018-With-GST.ipynb",
"provenance": [],
"collapsed_sections": [
"yZK6UdwSFnOO",
"ENA2OumIVeMA",
"dV6cXXlfi72r",
"vnV-FigfvsS2",
"g_G_HweN04W-",
"LEE6mQLh5Who"
],
"toc_visible": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "yZK6UdwSFnOO",
"colab_type": "text"
},
"source": [
"# **Download and install Mozilla TTS**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "yvb0pX3WY6MN",
"colab_type": "code",
"colab": {}
},
"source": [
"import os\n",
"!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "iB9nl2UEG3SY",
"colab_type": "code",
"colab": {}
},
"source": [
"!apt-get install espeak\n",
"os.chdir('TTS')\n",
"!pip install -r requirements.txt\n",
"!python setup.py develop\n",
"os.chdir('..')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "w6Krn8k1inC_",
"colab_type": "text"
},
"source": [
"\n",
"\n",
"**Download Checkpoint**\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "PiYHf3lKhi9z",
"colab_type": "code",
"colab": {}
},
"source": [
"!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018-with-GST.zip\n",
"!unzip ./TTS-checkpoint.zip\n",
"\n",
"# download the GST style example\n",
"!wget https://github.com/Edresson/TTS/releases/download/v1.0.0/gst-style-example.wav"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "MpYNgqrZcJKn",
"colab_type": "text"
},
"source": [
"**Utility Functions**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4KZA4b_CbMqx",
"colab_type": "code",
"colab": {}
},
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import argparse\n",
"import json\n",
"# pylint: disable=redefined-outer-name, unused-argument\n",
"import os\n",
"import string\n",
"import time\n",
"import sys\n",
"import numpy as np\n",
"\n",
"TTS_PATH = \"../content/TTS\"\n",
"# add libraries into environment\n",
"sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n",
"\n",
"import torch\n",
"\n",
"from TTS.tts.utils.generic_utils import setup_model\n",
"from TTS.tts.utils.synthesis import synthesis\n",
"from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.io import load_config\n",
"from TTS.vocoder.utils.generic_utils import setup_generator\n",
"\n",
"\n",
"def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None):\n",
" t_1 = time.time()\n",
" waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n",
" if CONFIG.model == \"Tacotron\" and not use_gl:\n",
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
" if not use_gl:\n",
" waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
" if use_cuda and not use_gl:\n",
" waveform = waveform.cpu()\n",
" if not use_gl:\n",
" waveform = waveform.numpy()\n",
" waveform = waveform.squeeze()\n",
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
" tps = (time.time() - t_1) / len(waveform)\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" print(\" > Real-time factor: {}\".format(rtf))\n",
" print(\" > Time per step: {}\".format(tps))\n",
" return waveform\n",
"\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ENA2OumIVeMA",
"colab_type": "text"
},
"source": [
"# **Variable definitions**\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "jPD0d_XpVXmY",
"colab_type": "code",
"colab": {}
},
"source": [
"TEXT = ''\n",
"OUT_PATH = 'tests-audios/'\n",
"# create output path\n",
"os.makedirs(OUT_PATH, exist_ok=True)\n",
"\n",
"SPEAKER_FILEID = None # if None, use the first embedding from speakers.json\n",
"\n",
"# model vars\n",
"MODEL_PATH = 'best_model.pth.tar'\n",
"CONFIG_PATH = 'config.json'\n",
"SPEAKER_JSON = 'speakers.json'\n",
"\n",
"# vocoder vars\n",
"VOCODER_PATH = ''\n",
"VOCODER_CONFIG_PATH = ''\n",
"\n",
"USE_CUDA = True"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "dV6cXXlfi72r",
"colab_type": "text"
},
"source": [
"# **Restore TTS Model**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "x1WgLFauWUPe",
"colab_type": "code",
"colab": {}
},
"source": [
"# load the config\n",
"C = load_config(CONFIG_PATH)\n",
"C.forward_attn_mask = True\n",
"\n",
"# load the audio processor\n",
"ap = AudioProcessor(**C.audio)\n",
"\n",
"# if the vocabulary was passed, replace the default\n",
"if 'characters' in C.keys():\n",
" symbols, phonemes = make_symbols(**C.characters)\n",
"\n",
"speaker_embedding = None\n",
"speaker_embedding_dim = None\n",
"num_speakers = 0\n",
"# load speakers\n",
"if SPEAKER_JSON != '':\n",
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
" num_speakers = len(speaker_mapping)\n",
" if C.use_external_speaker_embedding_file:\n",
" if SPEAKER_FILEID is not None:\n",
" speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']\n",
" else: # if speaker_fileid is not specified, use the first sample in speakers.json\n",
" choise_speaker = list(speaker_mapping.keys())[0]\n",
" print(\" Speaker: \",choise_speaker.split('_')[0],'was chosen automatically', \"(this speaker seen in training)\")\n",
" speaker_embedding = speaker_mapping[choise_speaker]['embedding']\n",
" speaker_embedding_dim = len(speaker_embedding)\n",
"\n",
"# load the model\n",
"num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
"model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n",
"cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n",
"model.load_state_dict(cp['model'])\n",
"model.eval()\n",
"\n",
"if USE_CUDA:\n",
" model.cuda()\n",
"\n",
"model.decoder.set_r(cp['r'])\n",
"\n",
"# load vocoder model\n",
"if VOCODER_PATH != \"\":\n",
" VC = load_config(VOCODER_CONFIG_PATH)\n",
" vocoder_model = setup_generator(VC)\n",
" vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n",
" vocoder_model.remove_weight_norm()\n",
" if USE_CUDA:\n",
" vocoder_model.cuda()\n",
" vocoder_model.eval()\n",
"else:\n",
" vocoder_model = None\n",
" VC = None\n",
"\n",
"# synthesize voice\n",
"use_griffin_lim = VOCODER_PATH == \"\"\n",
"\n",
"if not C.use_external_speaker_embedding_file:\n",
" if SPEAKER_FILEID.isdigit():\n",
" SPEAKER_FILEID = int(SPEAKER_FILEID)\n",
" else:\n",
" SPEAKER_FILEID = None\n",
"else:\n",
" SPEAKER_FILEID = None\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "tNvVEoE30qY6",
"colab_type": "text"
},
"source": [
"Synthesize a sentence with the chosen speaker.\n",
"\n",
"> Stop running the cell to exit!\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "2o8fXkVSyXOa",
"colab_type": "code",
"colab": {}
},
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n",
"gst_style = 'gst-style-example.wav'\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "vnV-FigfvsS2",
"colab_type": "text"
},
"source": [
"# **Select Speaker**\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "RuCGOnJ_fgDV",
"colab_type": "code",
"colab": {}
},
"source": [
"\n",
"# VCTK speakers not seen in training (new speakers)\n",
"VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n",
"\n",
"# VCTK speakers seen in training\n",
"VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n",
"\n",
"\n",
"num_samples_speaker = 2 # in theory, the more samples from a speaker, the closer the synthesized voice will be\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "hkvv7gRcx4WV",
"colab_type": "text"
},
"source": [
"## **Example: select a VCTK speaker seen in training**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "BviNMI9UyCYz",
"colab_type": "code",
"colab": {}
},
"source": [
"# get embedding\n",
"Speaker_choise = VCTK_train_Speakers[0] # choose one of the training speakers\n",
"# load speakers\n",
"if SPEAKER_JSON != '':\n",
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
" if C.use_external_speaker_embedding_file:\n",
" speaker_embeddings = []\n",
" for key in list(speaker_mapping.keys()):\n",
" if Speaker_choise in key:\n",
" if len(speaker_embeddings) < num_samples_speaker:\n",
" speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
" # take the average of the speaker's embedding samples\n",
" speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n",
" "
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "5e5_XnLsx3jg",
"colab_type": "code",
"colab": {}
},
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker seen in training)\")\n",
"gst_style = 'gst-style-example.wav'\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "QJ6VgT2a4vHW"
},
"source": [
"## **Example: select a VCTK speaker not seen in training (new speakers)**\n",
"\n",
"\n",
"> Fitting new speakers :)\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "SZS57ZK-4vHa",
"colab": {}
},
"source": [
"# get embedding\n",
"Speaker_choise = VCTK_test_Speakers[0] # choose one of the test speakers\n",
"# load speakers\n",
"if SPEAKER_JSON != '':\n",
" speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
" if C.use_external_speaker_embedding_file:\n",
" speaker_embeddings = []\n",
" for key in list(speaker_mapping.keys()):\n",
" if Speaker_choise in key:\n",
" if len(speaker_embeddings) < num_samples_speaker:\n",
" speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
" # take the average of the speaker's embedding samples\n",
" speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n",
" "
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "bbs85vzz4vHo",
"colab": {}
},
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"gst_style = 'gst-style-example.wav'\n",
"while True:\n",
" TEXT = input(\"Enter sentence: \")\n",
" print(\" > Text: {}\".format(TEXT))\n",
" wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
" IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
" # save the results\n",
" file_name = TEXT.replace(\" \", \"_\")\n",
" file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
" out_path = os.path.join(OUT_PATH, file_name)\n",
" print(\" > Saving output to {}\".format(out_path))\n",
" ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "g_G_HweN04W-",
"colab_type": "text"
},
"source": [
"# **Changing GST tokens manually (without wav reference)**"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jyFP5syW2bjt",
"colab_type": "text"
},
"source": [
"You can define the token weights manually; this lets you strengthen or weaken the effect of a given GST token. For example, if a token controls the length of the speaker's pauses, increasing its value produces longer pauses and decreasing it produces shorter ones."
]
},
{
"cell_type": "code",
"metadata": {
"id": "SpwjDjCM2a3Y",
"colab_type": "code",
"colab": {}
},
"source": [
"# set the GST token weights; this model has 5 tokens\n",
"gst_style = {\"0\": 0, \"1\": 0, \"3\": 0, \"4\": 0}"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "qWChMbI_0z5X",
"colab_type": "code",
"colab": {}
},
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "uFjUi9xQ3mG3",
"colab_type": "code",
"colab": {}
},
"source": [
"gst_style = {\"0\": 0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Uw0d6gWg4L27",
"colab_type": "code",
"colab": {}
},
"source": [
"gst_style = {\"0\": -0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "V9izw4-54-Tl",
"colab_type": "code",
"colab": {}
},
"source": [
"gst_style = {\"0\": 0, \"1\": 0.9, \"3\": 0, \"4\": 0}\n",
"print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "LEE6mQLh5Who"
},
"source": [
"# **Example: synthesizing with your own voice :)**\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "La70gSB65nrs",
"colab_type": "text"
},
"source": [
"Download and load the GE2E Speaker Encoder"
]
},
{
"cell_type": "code",
"metadata": {
"id": "r0IEFZ0B5vQg",
"colab_type": "code",
"colab": {}
},
"source": [
"!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n",
"!unzip ./SpeakerEncoder-checkpoint.zip"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "jEH8HCTh5mF6",
"colab_type": "code",
"colab": {}
},
"source": [
"SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n",
"SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n",
"SE_CONFIG_PATH = os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n",
"USE_CUDA = True"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "tOwkfQqT6-Qo",
"colab_type": "code",
"colab": {}
},
"source": [
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.speaker_encoder.model import SpeakerEncoder\n",
"se_config = load_config(SE_CONFIG_PATH)\n",
"se_ap = AudioProcessor(**se_config['audio'])\n",
"\n",
"se_model = SpeakerEncoder(**se_config.model)\n",
"se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n",
"se_model.eval()\n",
"if USE_CUDA:\n",
" se_model.cuda()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "0TLlbUFG8O36",
"colab_type": "text"
},
"source": [
"Upload one or more WAV audio files of your voice.\n",
"\n",
"\n",
"> We recommend files longer than 3 seconds; the longer the files, the closer the result will be to your voice :)\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "_FWwHPjJ8NXl",
"colab_type": "code",
"colab": {}
},
"source": [
"# select one or more wav files\n",
"from google.colab import files\n",
"file_list = files.upload()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "WWOf6sgbBbGY",
"colab_type": "code",
"colab": {}
},
"source": [
"# extract embedding from wav files\n",
"speaker_embeddings = []\n",
"for name in file_list.keys():\n",
" if '.wav' in name:\n",
" mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n",
" mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
" if USE_CUDA:\n",
" mel_spec = mel_spec.cuda()\n",
" embedd = se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
" speaker_embeddings.append(embedd)\n",
" else:\n",
" print(\"Please upload WAV files; other file types are not supported.\")\n",
"\n",
"# take the average of the embeddings of the uploaded samples\n",
"speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "AQ7eP31d9yzq",
"colab_type": "code",
"colab": {}
},
"source": [
"import IPython\n",
"from IPython.display import Audio\n",
"print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n",
"gst_style = {\"0\": 0, \"1\": 0.0, \"3\": 0, \"4\": 0}\n",
"gst_style = 'gst-style-example.wav'\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "11i10yE1-LMJ",
"colab_type": "text"
},
"source": [
"Upload your own GST reference WAV file."
]
},
{
"cell_type": "code",
"metadata": {
"id": "eKohSQG1-KkT",
"colab_type": "code",
"colab": {}
},
"source": [
"# select one wav file for GST reference\n",
"from google.colab import files\n",
"file_list = files.upload()\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab_type": "code",
"id": "xmItcGac5WiG",
"colab": {}
},
"source": [
"print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n",
"gst_style = list(file_list.keys())[0]\n",
"TEXT = input(\"Enter sentence: \")\n",
"print(\" > Text: {}\".format(TEXT))\n",
"wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
"IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
"# save the results\n",
"file_name = TEXT.replace(\" \", \"_\")\n",
"file_name = file_name.translate(\n",
" str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
"out_path = os.path.join(OUT_PATH, file_name)\n",
"print(\" > Saving output to {}\".format(out_path))\n",
"ap.save_wav(wav, out_path)"
],
"execution_count": null,
"outputs": []
}
]
}
@ -346,7 +346,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.8.5"
}
},
"nbformat": 4,
File diff suppressed because it is too large
@ -15,7 +15,7 @@
"- to set the right paths in the cell below.\n",
"\n",
"Repository:\n",
"- TTS: https://github.com/mozilla/TTS"
"- TTS: https://github.com/coqui/TTS"
]
},
{
@ -58,7 +58,7 @@
"metadata": {},
"outputs": [],
"source": [
"MODEL_RUN_PATH = \"../../Mozilla-TTS/checkpoints/libritts_360-half-September-28-2019_10+46AM-8565c50-20200323T115637Z-001/\"\n",
"MODEL_RUN_PATH = \"../../Coqui-TTS/checkpoints/libritts_360-half-September-28-2019_10+46AM-8565c50-20200323T115637Z-001/\"\n",
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
"CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
"\n",
@ -155,7 +155,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.8.5"
}
},
"nbformat": 4,
@ -181,7 +181,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
"version": "3.8.5"
}
},
"nbformat": 4,
@ -1,412 +1,425 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "6LWsNd3_M3MP"
},
"source": [
"# Converting Pytorch models to Tensorflow and TFLite by MozillaTTS"
"# Converting Pytorch models to Tensorflow and TFLite by CoquiTTS"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "FAqrSIWgLyP0"
},
"source": [
"This is a tutorial demonstrating Mozilla TTS capabilities to convert \n",
"This is a tutorial demonstrating Coqui TTS capabilities to convert \n",
"trained PyTorch models to Tensorflow and Tflite.\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "MBJjGYnoEo4v"
},
"source": [
"# Installation"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Ku-dA4DKoeXk"
},
"source": [
"### Download TF Models and configs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 162
},
"colab_type": "code",
"id": "jGIgnWhGsxU1",
"outputId": "b461952f-8507-4dd2-af06-4e6b8692765d",
"tags": []
},
"outputs": [],
"source": [
"!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n",
"!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 235
},
"colab_type": "code",
"id": "4dnpE0-kvTsu",
"outputId": "f67c3138-bda0-4b3e-ffcc-647f9feec23e",
"tags": []
},
"outputs": [],
"source": [
"!gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar\n",
"!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json\n",
"!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "3IGvvCRMEwqn"
},
"source": [
"# Model Conversion PyTorch -> TF -> TFLite"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "tLhz8SAf8Pgp"
},
"source": [
"## Converting PyTorch to Tensorflow\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"colab_type": "code",
"id": "Xsrvr_WQ8Ib5",
"outputId": "dae96616-e5f7-41b6-cdb9-5026cfcd3214",
"tags": []
},
"outputs": [],
"source": [
"# convert TTS model to Tensorflow\n",
"!python ../TTS/bin/convert_tacotron2_torch_to_tf.py --config_path data/config.json --torch_model_path data/tts_model.pth.tar --output_path data/tts_model_tf.pkl"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"colab_type": "code",
"id": "VJ4NA5If9ljv",
"outputId": "1520dca8-1db8-4e07-bc0c-b1d5941c775e",
"tags": []
},
"outputs": [],
"source": [
"# convert Vocoder model to Tensorflow\n",
"!python ../TTS/bin/convert_melgan_torch_to_tf.py --config_path data/config_vocoder.json --torch_model_path data/vocoder_model.pth.tar --output_path data/vocoder_model_tf.pkl"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "7d5vTkBZ-BYQ"
},
"source": [
"## Converting Tensorflow to TFLite"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 927
},
"colab_type": "code",
"id": "33hTfpuU99cg",
"outputId": "8a0e5be1-23a2-4128-ee37-8232adcb8ff0",
"tags": []
},
"outputs": [],
"source": [
"# convert TTS model to TFLite\n",
"!python ../TTS/bin/convert_tacotron2_tflite.py --config_path data/config.json --tf_model data/tts_model_tf.pkl --output_path data/tts_model.tflite"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 364
},
"colab_type": "code",
"id": "e00Hm75Y-wZ2",
"outputId": "42381b05-3c9d-44f0-dac7-d81efd95eadf",
"tags": []
},
"outputs": [],
"source": [
"# convert Vocoder model to TFLite\n",
"!python ../TTS/bin/convert_melgan_tflite.py --config_path data/config_vocoder.json --tf_model data/vocoder_model_tf.pkl --output_path data/vocoder_model.tflite"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Zlgi8fPdpRF0"
},
"source": [
"# Run Inference with TFLite "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "f-Yc42nQZG5A"
},
"outputs": [],
"source": [
"def run_vocoder(mel_spec):\n",
" vocoder_inputs = mel_spec[None, :, :]\n",
" # get input and output details\n",
" input_details = vocoder_model.get_input_details()\n",
" # reshape input tensor for the new input shape\n",
" vocoder_model.resize_tensor_input(input_details[0]['index'], vocoder_inputs.shape)\n",
" vocoder_model.allocate_tensors()\n",
" detail = input_details[0]\n",
" vocoder_model.set_tensor(detail['index'], vocoder_inputs)\n",
" # run the model\n",
" vocoder_model.invoke()\n",
" # collect outputs\n",
" output_details = vocoder_model.get_output_details()\n",
" waveform = vocoder_model.get_tensor(output_details[0]['index'])\n",
" return waveform \n",
"\n",
"\n",
"def tts(model, text, CONFIG, p):\n",
" t_1 = time.time()\n",
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
" truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n",
" backend='tflite')\n",
" waveform = run_vocoder(mel_postnet_spec.T)\n",
" waveform = waveform[0, 0]\n",
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
" tps = (time.time() - t_1) / len(waveform)\n",
" print(waveform.shape)\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" print(\" > Real-time factor: {}\".format(rtf))\n",
" print(\" > Time per step: {}\".format(tps))\n",
" IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n",
" return alignment, mel_postnet_spec, stop_tokens, waveform"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ZksegYQepkFg"
},
"source": [
"### Load TF Models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "oVa0kOamprgj"
},
"outputs": [],
"source": [
"import os\n",
"import torch\n",
"import time\n",
"import IPython\n",
"\n",
"from TTS.tts.tf.utils.tflite import load_tflite_model\n",
"from TTS.tts.tf.utils.io import load_checkpoint\n",
"from TTS.utils.io import load_config\n",
"from TTS.tts.utils.text.symbols import symbols, phonemes\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.tts.utils.synthesis import synthesis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "EY-sHVO8IFSH"
},
"outputs": [],
"source": [
"# runtime settings\n",
"use_cuda = False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "_1aIUp2FpxOQ"
},
"outputs": [],
"source": [
"# model paths\n",
"TTS_MODEL = \"data/tts_model.tflite\"\n",
"TTS_CONFIG = \"data/config.json\"\n",
"VOCODER_MODEL = \"data/vocoder_model.tflite\"\n",
"VOCODER_CONFIG = \"data/config_vocoder.json\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "CpgmdBVQplbv"
},
"outputs": [],
"source": [
"# load configs\n",
"TTS_CONFIG = load_config(TTS_CONFIG)\n",
"VOCODER_CONFIG = load_config(VOCODER_CONFIG)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 471
},
"colab_type": "code",
"id": "zmrQxiozIUVE",
"outputId": "21cda136-de87-4d55-fd46-7d5306103d90",
"tags": []
},
"outputs": [],
"source": [
"# load the audio processor\n",
"TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n",
"ap = AudioProcessor(**TTS_CONFIG.audio) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "8fLoI4ipqMeS"
},
"outputs": [],
"source": [
"# LOAD TTS MODEL\n",
"# multi speaker \n",
"speaker_id = None\n",
"speakers = []\n",
"\n",
"# load the models\n",
"model = load_tflite_model(TTS_MODEL)\n",
"vocoder_model = load_tflite_model(VOCODER_MODEL)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Ws_YkPKsLgo-"
},
"source": [
"## Run Sample Sentence"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 134
},
"colab_type": "code",
"id": "FuWxZ9Ey5Puj",
"outputId": "535c2df1-c27c-458b-e14b-41a977635aa1",
"tags": []
},
"outputs": [],
"source": [
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "Tutorial_Converting_PyTorch_to_TF_to_TFlite.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}