diff --git a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example_Synthetizer.ipynb b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example_Synthetizer.ipynb deleted file mode 100644 index 1be93a82..00000000 --- a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example_Synthetizer.ipynb +++ /dev/null @@ -1,606 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "6LWsNd3_M3MP" - }, - "source": [ - "# Mozilla TTS on CPU Real-Time Speech Synthesis " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "FAqrSIWgLyP0" - }, - "source": [ - "We use Tacotron2 and MultiBand-Melgan models and Baker dataset (chinese mandarin).\n", - "\n", - "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 126K steps (3 days) with a single GPU.\n", - "\n", - "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n", - "\n", - "Note that both model performances can be improved with more training." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Ku-dA4DKoeXk" - }, - "source": [ - "### Download Models" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Zlgi8fPdpRF0" - }, - "source": [ - "### Define TTS function" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ZksegYQepkFg" - }, - "source": [ - "### Load Models" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "oVa0kOamprgj" - }, - "outputs": [], - "source": [ - "import os\n", - "import torch\n", - "import IPython\n", - "\n", - "from TTS.utils.synthesizer import Synthesizer\n", - "from TTS.utils.manage import ModelManager\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "EY-sHVO8IFSH" - }, - "outputs": [], - "source": [ - "# runtime settings\n", - "use_cuda = False" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# tts and vocoder name\n", - "TTS_NAME = \"tts_models/zh-CN/baker/tacotron2-DDC-GST\"\n", - "VOCODER_NAME = \"vocoder_models/en/ljspeech/multiband-melgan\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "manager = ModelManager(\"../TTS/.models.json\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " > tts_models/zh-CN/baker/tacotron2-DDC-GST is already downloaded.\n", - " > vocoder_models/en/ljspeech/multiband-melgan is already downloaded.\n" - ] - } - ], - "source": [ - "tts_checkpoint_file, tts_config_file, tts_json_dict = manager.download_model(TTS_NAME)\n", - "vocoder_checkpoint_file, vocoder_config_file, vocoder_json_dict = manager.download_model(VOCODER_NAME)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " > Using model: tacotron2\n", - " > Generator Model: multiband_melgan_generator\n" - ] - } - ], - "source": [ - "synthesizer = Synthesizer(tts_checkpoint_file, tts_config_file, vocoder_checkpoint_file, vocoder_config_file, use_cuda)\n", - "sample_rate = 
synthesizer.tts_config.audio[\"sample_rate\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Ws_YkPKsLgo-" - }, - "source": [ - "## Run Inference" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "# Here some test sentences for you to play with :\n", - "sentences= [\"我从来不会说很标准的中文。\",\n", - "\"我喜欢听人工智能的博客。\",\n", - "\"我来自一个法国郊区的地方。\",\n", - "\"不比不知道,一比吓一跳!\",\n", - "\"台湾是一个真的很好玩的地方!\",\n", - "\"干一行,行一行,行行都行。\",\n", - "\"我要盖被子,好尴尬!\",]" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " > Text splitted to sentences.\n", - "['我从来不会说很标准的中文。']\n", - " > Processing time: 1.6665124893188477\n", - " > Real-time factor: 0.5583910829911347\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " > Text splitted to sentences.\n", - "['我喜欢听人工智能的博客。']\n", - " > Processing time: 1.4052538871765137\n", - " > Real-time factor: 0.5193391025114328\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " > Text splitted to sentences.\n", - "['我来自一个法国郊区的地方。']\n", - " > Processing time: 1.605910062789917\n", - " > Real-time factor: 0.5785999490934259\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " > Text splitted to sentences.\n", - "['不比不知道,一比吓一跳!']\n", - " > Processing time: 1.9105627536773682\n", - " > Real-time factor: 0.6607262973429417\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " > Text splitted to sentences.\n", - "['台湾是一个真的很好玩的地方!']\n", - " > Processing time: 1.3081049919128418\n", - " > Real-time factor: 0.4218891158389621\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " > Text splitted to sentences.\n", - "['干一行,行一行,行行都行。']\n", - " > Processing time: 2.0958540439605713\n", - " > Real-time factor: 0.6709288860239634\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " > Text splitted to sentences.\n", - "['我要盖被子,好尴尬!']\n", - " > Processing time: 1.5188167095184326\n", - " > Real-time factor: 0.6257456734843319\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "for sentence in sentences:\n", - " wav = synthesizer.tts(sentence)\n", - " IPython.display.display(IPython.display.Audio(wav, rate=sample_rate)) \n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", 
- "output_type": "stream", - "text": [ - " > Text splitted to sentences.\n", - "['我喜欢听人工智能的博客。']\n", - " > Processing time: 2.114016056060791\n", - " > Real-time factor: 0.643271887228699\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# you can also play with Global Style Token (GST) by feeding a \n", - "# ... wav_style parameter to the tts method\n", - "\n", - "style_wav = {\"2\": 0.2}\n", - "\n", - "wav = synthesizer.tts(sentences[1], style_wav=style_wav)\n", - "IPython.display.display(IPython.display.Audio(wav, rate=sample_rate)) " - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " > Text splitted to sentences.\n", - "['我喜欢听人工智能的博客。']\n", - " > Processing time: 1.5687272548675537\n", - " > Real-time factor: 0.6401842606201799\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " > Text splitted to sentences.\n", - "['我喜欢听人工智能的博客。']\n", - " > Processing time: 2.070594072341919\n", - " > Real-time factor: 0.8067677285683367\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " > Text splitted to sentences.\n", - "['我喜欢听人工智能的博客。']\n", - " > Processing time: 1.3769311904907227\n", - " > Real-time factor: 0.5088718951180015\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " > Text splitted to sentences.\n", - "['我喜欢听人工智能的博客。']\n", - " > Processing time: 2.024374485015869\n", - " > Real-time factor: 0.6782983435843654\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " > Text splitted to sentences.\n", - "['我喜欢听人工智能的博客。']\n", - " > Processing time: 2.4434399604797363\n", - " > Real-time factor: 0.7435119663360867\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# On this model specifically, we can observe that the GSToken \"2\" is responsible for speech speed\n", - "# You can listen to these 5 different samples, the flow is slower and slower as the value is higher\n", - "for value in [-0.2, -0.1, 0, 0.1, 0.2]:\n", - " style_wav = {\"2\": value}\n", - " wav = synthesizer.tts(sentences[1], style_wav=style_wav)\n", - " IPython.display.display(IPython.display.Audio(wav, rate=sample_rate)) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": 
"ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/DDC_TTS_and_MultiBand_MelGAN_Example.ipynb b/notebooks/DDC_TTS_and_MultiBand_MelGAN_Example.ipynb deleted file mode 100644 index 67171b0e..00000000 --- a/notebooks/DDC_TTS_and_MultiBand_MelGAN_Example.ipynb +++ /dev/null @@ -1,342 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "6LWsNd3_M3MP" - }, - "source": [ - "# Mozilla TTS on CPU Real-Time Speech Synthesis " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "FAqrSIWgLyP0" - }, - "source": [ - "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n", - "\n", - "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n", - "\n", - "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n", - "\n", - "Note that both model performances can be improved with more training." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Ku-dA4DKoeXk" - }, - "source": [ - "### Download Models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 162 - }, - "colab_type": "code", - "id": "jGIgnWhGsxU1", - "outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe", - "tags": [] - }, - "outputs": [], - "source": [ - "!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n", - "!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 235 - }, - "colab_type": "code", - "id": "4dnpE0-kvTsu", - "outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e", - "tags": [] - }, - "outputs": [], - "source": [ - "!gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar\n", - "!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json\n", - "!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Zlgi8fPdpRF0" - }, - "source": [ - "### Define TTS function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "f-Yc42nQZG5A" - }, - "outputs": [], - "source": [ - "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", - " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n", - " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n", - " # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n", - " if not use_gl:\n", - " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", - " waveform = waveform.flatten()\n", - " if use_cuda:\n", - " waveform = waveform.cpu()\n", - " waveform = waveform.numpy()\n", - " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", - " tps = (time.time() - t_1) / len(waveform)\n", - " print(waveform.shape)\n", - " print(\" > Run-time: 
{}\".format(time.time() - t_1))\n", - " print(\" > Real-time factor: {}\".format(rtf))\n", - " print(\" > Time per step: {}\".format(tps))\n", - " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", - " return alignment, mel_postnet_spec, stop_tokens, waveform" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ZksegYQepkFg" - }, - "source": [ - "### Load Models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "oVa0kOamprgj" - }, - "outputs": [], - "source": [ - "import os\n", - "import torch\n", - "import time\n", - "import IPython\n", - "\n", - "from TTS.tts.utils.generic_utils import setup_model\n", - "from TTS.utils.io import load_config\n", - "from TTS.tts.utils.text.symbols import symbols, phonemes\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.tts.utils.synthesis import synthesis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "EY-sHVO8IFSH" - }, - "outputs": [], - "source": [ - "# runtime settings\n", - "use_cuda = False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "_1aIUp2FpxOQ" - }, - "outputs": [], - "source": [ - "# model paths\n", - "TTS_MODEL = \"data/tts_model.pth.tar\"\n", - "TTS_CONFIG = \"data/config.json\"\n", - "VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n", - "VOCODER_CONFIG = \"data/config_vocoder.json\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "CpgmdBVQplbv" - }, - "outputs": [], - "source": [ - "# load configs\n", - "TTS_CONFIG = load_config(TTS_CONFIG)\n", - "VOCODER_CONFIG = load_config(VOCODER_CONFIG)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 471 - }, - "colab_type": "code", - "id": "zmrQxiozIUVE", - "outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49", - "tags": [] - }, - "outputs": [], - "source": [ - "# load the audio processor\n", - "TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n", - "ap = AudioProcessor(**TTS_CONFIG.audio) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "colab_type": "code", - "id": "8fLoI4ipqMeS", - "outputId": "b789066e-e305-42ad-b3ca-eba8d9267382", - "tags": [] - }, - "outputs": [], - "source": [ - "# LOAD TTS MODEL\n", - "# multi speaker \n", - "speaker_id = None\n", - "speakers = []\n", - "\n", - "# load the model\n", - "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n", - "\n", - "# load model state\n", - "cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n", - "\n", - "# load the model\n", - "model.load_state_dict(cp['model'])\n", - "if use_cuda:\n", - " model.cuda()\n", - "model.eval()\n", - "\n", - "# set model stepsize\n", - "if 'r' in cp:\n", - " model.decoder.set_r(cp['r'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "colab_type": "code", - "id": "zKoq0GgzqzhQ", - "outputId": "234efc61-f37a-40bc-95a3-b51896018ccb", - "tags": [] - }, - "outputs": [], - "source": [ - "from 
TTS.vocoder.utils.generic_utils import setup_generator\n", - "\n", - "# LOAD VOCODER MODEL\n", - "vocoder_model = setup_generator(VOCODER_CONFIG)\n", - "vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n", - "vocoder_model.remove_weight_norm()\n", - "vocoder_model.inference_padding = 0\n", - "\n", - "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n", - "if use_cuda:\n", - " vocoder_model.cuda()\n", - "vocoder_model.eval()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Ws_YkPKsLgo-" - }, - "source": [ - "## Run Inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 134 - }, - "colab_type": "code", - "id": "FuWxZ9Ey5Puj", - "outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91", - "tags": [] - }, - "outputs": [], - "source": [ - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/DDC_TTS_and_MultiBand_MelGAN_TF_Example.ipynb b/notebooks/DDC_TTS_and_MultiBand_MelGAN_TF_Example.ipynb deleted file mode 100644 index 4b009ce9..00000000 --- a/notebooks/DDC_TTS_and_MultiBand_MelGAN_TF_Example.ipynb +++ /dev/null @@ -1,346 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false", - "colab_type": "text", - "id": "6LWsNd3_M3MP" - }, - "source": [ - "# Mozilla TTS on CPU Real-Time Speech Synthesis with Tensorflow" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false", - "colab_type": "text", - "id": "FAqrSIWgLyP0" - }, - "source": [ - "**These models are converted from released [PyTorch models](https://colab.research.google.com/drive/1u_16ZzHjKYFn1HNVuA4Qf_i2MMFB9olY?usp=sharing) using our TF utilities provided in Mozilla TTS.**\n", - "\n", - "These TF models support TF 2.2 and for different versions you might need to\n", - "regenerate them. 
\n", - "\n", - "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n", - "\n", - "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n", - "\n", - "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n", - "\n", - "Note that both model performances can be improved with more training.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false", - "colab_type": "text", - "id": "Ku-dA4DKoeXk" - }, - "source": [ - "### Download Models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 162 - }, - "colab_type": "code", - "id": "jGIgnWhGsxU1", - "outputId": "08b0dddd-4edf-48c9-e8e5-a419b36a5c3d", - "tags": [] - }, - "outputs": [], - "source": [ - "!gdown --id 1p7OSEEW_Z7ORxNgfZwhMy7IiLE1s0aH7 -O data/tts_model.pkl\n", - "!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 235 - }, - "colab_type": "code", - "id": "4dnpE0-kvTsu", - "outputId": "2fe836eb-c7e7-4f1e-9352-0142126bb19f", - "tags": [] - }, - "outputs": [], - "source": [ - "!gdown --id 1rHmj7CqD3Sfa716Y3ub_vpIBrQg_b1yF -O data/vocoder_model.pkl\n", - "!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json\n", - "!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false", - "colab_type": "text", - "id": "Zlgi8fPdpRF0" - }, - "source": [ - "### Define TTS function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "colab": {}, - "colab_type": "code", - "id": "f-Yc42nQZG5A" - }, - "outputs": [], - "source": [ - "def tts(model, text, CONFIG, p):\n", - " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n", - " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n", - " backend='tf')\n", - " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", - " waveform = waveform.numpy()[0, 0]\n", - " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", - " tps = (time.time() - t_1) / len(waveform)\n", - " print(waveform.shape)\n", - " print(\" > Run-time: {}\".format(time.time() - t_1))\n", - " print(\" > Real-time factor: {}\".format(rtf))\n", - " print(\" > Time per step: {}\".format(tps))\n", - " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", - " return alignment, mel_postnet_spec, stop_tokens, waveform" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false", - "colab_type": "text", - "id": "ZksegYQepkFg" - }, - "source": [ - "### Load Models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "colab": {}, - "colab_type": "code", - "id": "oVa0kOamprgj" - }, - "outputs": [], - "source": [ - "import os\n", - "import torch\n", - "import time\n", - "import IPython\n", - "\n", - "from TTS.tts.tf.utils.generic_utils import setup_model\n", - "from TTS.tts.tf.utils.io import load_checkpoint\n", - 
"from TTS.utils.io import load_config\n", - "from TTS.tts.utils.text.symbols import symbols, phonemes\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.tts.utils.synthesis import synthesis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "colab": {}, - "colab_type": "code", - "id": "EY-sHVO8IFSH" - }, - "outputs": [], - "source": [ - "# runtime settings\n", - "use_cuda = False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "colab": {}, - "colab_type": "code", - "id": "_1aIUp2FpxOQ" - }, - "outputs": [], - "source": [ - "# model paths\n", - "TTS_MODEL = \"data/tts_model.pkl\"\n", - "TTS_CONFIG = \"data/config.json\"\n", - "VOCODER_MODEL = \"data/vocoder_model.pkl\"\n", - "VOCODER_CONFIG = \"data/config_vocoder.json\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "colab": {}, - "colab_type": "code", - "id": "CpgmdBVQplbv" - }, - "outputs": [], - "source": [ - "# load configs\n", - "TTS_CONFIG = load_config(TTS_CONFIG)\n", - "VOCODER_CONFIG = load_config(VOCODER_CONFIG)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 471 - }, - "colab_type": "code", - "id": "zmrQxiozIUVE", - "outputId": "fa71bd05-401f-4e5b-a6f7-60ae765966db", - "tags": [] - }, - "outputs": [], - "source": [ - "# load the audio processor\n", - "TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n", - "ap = AudioProcessor(**TTS_CONFIG.audio) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 72 - }, - "colab_type": "code", - "id": "8fLoI4ipqMeS", - "outputId": "595d990f-930d-4698-ee14-77796b5eed7d", - "tags": [] - }, - "outputs": [], - "source": [ - "# LOAD TTS MODEL\n", - "# multi speaker \n", - "speaker_id = None\n", - "speakers = []\n", - "\n", - "# load the model\n", - "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n", - "model.build_inference()\n", - "model = load_checkpoint(model, TTS_MODEL)\n", - "model.decoder.set_max_decoder_steps(1000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 489 - }, - "colab_type": "code", - "id": "zKoq0GgzqzhQ", - "outputId": "2cc3deae-144f-4465-da3b-98628d948506" - }, - "outputs": [], - "source": [ - "from TTS.vocoder.tf.utils.generic_utils import setup_generator\n", - "from TTS.vocoder.tf.utils.io import load_checkpoint\n", - "\n", - "# LOAD VOCODER MODEL\n", - "vocoder_model = setup_generator(VOCODER_CONFIG)\n", - "vocoder_model.build_inference()\n", - "vocoder_model = load_checkpoint(vocoder_model, VOCODER_MODEL)\n", - "vocoder_model.inference_padding = 0\n", - "\n", - "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "Collapsed": "false", - "colab_type": "text", - "id": "Ws_YkPKsLgo-" - }, - "source": [ - "## Run Inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 134 - }, - "colab_type": "code", - "id": "FuWxZ9Ey5Puj", - "outputId": 
"07ede6e5-06e6-4612-f687-7984d20e5254" - }, - "outputs": [], - "source": [ - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "DDC-TTS_and_MultiBand-MelGAN_TF_Example.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/DDC_TTS_and_ParallelWaveGAN_Example.ipynb b/notebooks/DDC_TTS_and_ParallelWaveGAN_Example.ipynb deleted file mode 100644 index 4c1008e0..00000000 --- a/notebooks/DDC_TTS_and_ParallelWaveGAN_Example.ipynb +++ /dev/null @@ -1,342 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "6LWsNd3_M3MP" - }, - "source": [ - "# Mozilla TTS on CPU Real-Time Speech Synthesis " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "FAqrSIWgLyP0" - }, - "source": [ - "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n", - "\n", - "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n", - "\n", - "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n", - "\n", - "Note that both model performances can be improved with more training." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Ku-dA4DKoeXk" - }, - "source": [ - "### Download Models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 162 - }, - "colab_type": "code", - "id": "jGIgnWhGsxU1", - "outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe", - "tags": [] - }, - "outputs": [], - "source": [ - "!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n", - "!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 235 - }, - "colab_type": "code", - "id": "4dnpE0-kvTsu", - "outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e", - "tags": [] - }, - "outputs": [], - "source": [ - "!gdown --id 1X09hHAyAJOnrplCUMAdW_t341Kor4YR4 -O data/vocoder_model.pth.tar\n", - "!gdown --id \"1qN7vQRIYkzvOX_DtiZtTajzoZ1eW1-Eg\" -O data/config_vocoder.json\n", - "!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Zlgi8fPdpRF0" - }, - "source": [ - "### Define TTS function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "f-Yc42nQZG5A" - }, - "outputs": [], - "source": [ - "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", - " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n", - " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n", - " # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n", - " if not use_gl:\n", - " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", - " waveform = waveform.flatten()\n", - " if use_cuda:\n", - " waveform = waveform.cpu()\n", - " waveform = waveform.numpy()\n", - " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", - " tps = (time.time() - t_1) / len(waveform)\n", - " print(waveform.shape)\n", - " print(\" > Run-time: {}\".format(time.time() - t_1))\n", - " print(\" > Real-time factor: {}\".format(rtf))\n", - " print(\" > Time per step: {}\".format(tps))\n", - " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", - " return alignment, mel_postnet_spec, stop_tokens, waveform" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ZksegYQepkFg" - }, - "source": [ - "### Load Models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "oVa0kOamprgj" - }, - "outputs": [], - "source": [ - "import os\n", - "import torch\n", - "import time\n", - "import IPython\n", - "\n", - "from TTS.tts.utils.generic_utils import setup_model\n", - "from TTS.utils.io import load_config\n", - "from TTS.tts.utils.text.symbols import symbols, phonemes\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.tts.utils.synthesis import synthesis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "EY-sHVO8IFSH" - }, - "outputs": [], - "source": [ - "# runtime settings\n", - "use_cuda = False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": { - "colab": {}, - "colab_type": "code", - "id": "_1aIUp2FpxOQ" - }, - "outputs": [], - "source": [ - "# model paths\n", - "TTS_MODEL = \"data/tts_model.pth.tar\"\n", - "TTS_CONFIG = \"data/config.json\"\n", - "VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n", - "VOCODER_CONFIG = \"data/config_vocoder.json\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "CpgmdBVQplbv" - }, - "outputs": [], - "source": [ - "# load configs\n", - "TTS_CONFIG = load_config(TTS_CONFIG)\n", - "VOCODER_CONFIG = load_config(VOCODER_CONFIG)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 471 - }, - "colab_type": "code", - "id": "zmrQxiozIUVE", - "outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49", - "tags": [] - }, - "outputs": [], - "source": [ - "# load the audio processor\n", - "TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n", - "ap = AudioProcessor(**TTS_CONFIG.audio) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "colab_type": "code", - "id": "8fLoI4ipqMeS", - "outputId": "b789066e-e305-42ad-b3ca-eba8d9267382", - "tags": [] - }, - "outputs": [], - "source": [ - "# LOAD TTS MODEL\n", - "# multi speaker \n", - "speaker_id = None\n", - "speakers = []\n", - "\n", - "# load the model\n", - "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n", - "\n", - "# load model state\n", - "cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n", - "\n", - "# load the model\n", - "model.load_state_dict(cp['model'])\n", - "if use_cuda:\n", - " model.cuda()\n", - "model.eval()\n", - "\n", - "# set model stepsize\n", - "if 'r' in cp:\n", - " model.decoder.set_r(cp['r'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "colab_type": "code", - "id": "zKoq0GgzqzhQ", - "outputId": "234efc61-f37a-40bc-95a3-b51896018ccb", - "tags": [] - }, - "outputs": [], - "source": [ - "from TTS.vocoder.utils.generic_utils import setup_generator\n", - "\n", - "# LOAD VOCODER MODEL\n", - "vocoder_model = setup_generator(VOCODER_CONFIG)\n", - "vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n", - "vocoder_model.remove_weight_norm()\n", - "vocoder_model.inference_padding = 0\n", - "\n", - "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n", - "if use_cuda:\n", - " vocoder_model.cuda()\n", - "vocoder_model.eval()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Ws_YkPKsLgo-" - }, - "source": [ - "## Run Inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 134 - }, - "colab_type": "code", - "id": "FuWxZ9Ey5Puj", - "outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91", - "tags": [] - }, - "outputs": [], - "source": [ - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": 
[], - "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}
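
The notebooks removed above all demonstrated the same CPU inference workflow (download a Tacotron2-DDC model and a MultiBand-MelGAN vocoder, build a synthesizer, optionally steer prosody with a Global Style Token). For reference, the sketch below condenses that workflow using the `ModelManager`/`Synthesizer` calls exactly as they appear in the deleted Chinese Mandarin notebook; the model names, the three-value return of `download_model`, and the positional `Synthesizer(...)` signature are taken from that notebook and may differ in later releases of the TTS package, so treat this as an illustration rather than current API documentation.

```python
import IPython

from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

# Model identifiers as used in the deleted notebook.
TTS_NAME = "tts_models/zh-CN/baker/tacotron2-DDC-GST"
VOCODER_NAME = "vocoder_models/en/ljspeech/multiband-melgan"

# Resolve and download checkpoints/configs via the bundled model registry.
manager = ModelManager("../TTS/.models.json")
tts_checkpoint, tts_config, _ = manager.download_model(TTS_NAME)
voc_checkpoint, voc_config, _ = manager.download_model(VOCODER_NAME)

# CPU inference (use_cuda=False), matching the notebook's runtime settings.
synthesizer = Synthesizer(tts_checkpoint, tts_config, voc_checkpoint, voc_config, False)
sample_rate = synthesizer.tts_config.audio["sample_rate"]

# Plain synthesis.
wav = synthesizer.tts("我喜欢听人工智能的博客。")
IPython.display.display(IPython.display.Audio(wav, rate=sample_rate))

# Global Style Token control: in this particular model, token "2" was observed
# to govern speaking rate, with larger values producing slower speech.
wav_slow = synthesizer.tts("我喜欢听人工智能的博客。", style_wav={"2": 0.2})
IPython.display.display(IPython.display.Audio(wav_slow, rate=sample_rate))
```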