diff --git a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example_Synthetizer.ipynb b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example_Synthetizer.ipynb
deleted file mode 100644
index 1be93a82..00000000
--- a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example_Synthetizer.ipynb
+++ /dev/null
@@ -1,606 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "6LWsNd3_M3MP"
- },
- "source": [
- "# Mozilla TTS on CPU Real-Time Speech Synthesis "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "FAqrSIWgLyP0"
- },
- "source": [
- "We use Tacotron2 and MultiBand-Melgan models and Baker dataset (chinese mandarin).\n",
- "\n",
- "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 126K steps (3 days) with a single GPU.\n",
- "\n",
- "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n",
- "\n",
- "Note that both model performances can be improved with more training."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "Ku-dA4DKoeXk"
- },
- "source": [
- "### Download Models"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "Zlgi8fPdpRF0"
- },
- "source": [
- "### Define TTS function"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "ZksegYQepkFg"
- },
- "source": [
- "### Load Models"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "oVa0kOamprgj"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "import torch\n",
- "import IPython\n",
- "\n",
- "from TTS.utils.synthesizer import Synthesizer\n",
- "from TTS.utils.manage import ModelManager\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "EY-sHVO8IFSH"
- },
- "outputs": [],
- "source": [
- "# runtime settings\n",
- "use_cuda = False"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "# tts and vocoder name\n",
- "TTS_NAME = \"tts_models/zh-CN/baker/tacotron2-DDC-GST\"\n",
- "VOCODER_NAME = \"vocoder_models/en/ljspeech/multiband-melgan\"\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "manager = ModelManager(\"../TTS/.models.json\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > tts_models/zh-CN/baker/tacotron2-DDC-GST is already downloaded.\n",
- " > vocoder_models/en/ljspeech/multiband-melgan is already downloaded.\n"
- ]
- }
- ],
- "source": [
- "tts_checkpoint_file, tts_config_file, tts_json_dict = manager.download_model(TTS_NAME)\n",
- "vocoder_checkpoint_file, vocoder_config_file, vocoder_json_dict = manager.download_model(VOCODER_NAME)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Using model: tacotron2\n",
- " > Generator Model: multiband_melgan_generator\n"
- ]
- }
- ],
- "source": [
- "synthesizer = Synthesizer(tts_checkpoint_file, tts_config_file, vocoder_checkpoint_file, vocoder_config_file, use_cuda)\n",
- "sample_rate = synthesizer.tts_config.audio[\"sample_rate\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "Ws_YkPKsLgo-"
- },
- "source": [
- "## Run Inference"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Here some test sentences for you to play with :\n",
- "sentences= [\"我从来不会说很标准的中文。\",\n",
- "\"我喜欢听人工智能的博客。\",\n",
- "\"我来自一个法国郊区的地方。\",\n",
- "\"不比不知道,一比吓一跳!\",\n",
- "\"台湾是一个真的很好玩的地方!\",\n",
- "\"干一行,行一行,行行都行。\",\n",
- "\"我要盖被子,好尴尬!\",]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Text splitted to sentences.\n",
- "['我从来不会说很标准的中文。']\n",
- " > Processing time: 1.6665124893188477\n",
- " > Real-time factor: 0.5583910829911347\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Text splitted to sentences.\n",
- "['我喜欢听人工智能的博客。']\n",
- " > Processing time: 1.4052538871765137\n",
- " > Real-time factor: 0.5193391025114328\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Text splitted to sentences.\n",
- "['我来自一个法国郊区的地方。']\n",
- " > Processing time: 1.605910062789917\n",
- " > Real-time factor: 0.5785999490934259\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Text splitted to sentences.\n",
- "['不比不知道,一比吓一跳!']\n",
- " > Processing time: 1.9105627536773682\n",
- " > Real-time factor: 0.6607262973429417\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Text splitted to sentences.\n",
- "['台湾是一个真的很好玩的地方!']\n",
- " > Processing time: 1.3081049919128418\n",
- " > Real-time factor: 0.4218891158389621\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Text splitted to sentences.\n",
- "['干一行,行一行,行行都行。']\n",
- " > Processing time: 2.0958540439605713\n",
- " > Real-time factor: 0.6709288860239634\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Text splitted to sentences.\n",
- "['我要盖被子,好尴尬!']\n",
- " > Processing time: 1.5188167095184326\n",
- " > Real-time factor: 0.6257456734843319\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "for sentence in sentences:\n",
- " wav = synthesizer.tts(sentence)\n",
- " IPython.display.display(IPython.display.Audio(wav, rate=sample_rate)) \n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Text splitted to sentences.\n",
- "['我喜欢听人工智能的博客。']\n",
- " > Processing time: 2.114016056060791\n",
- " > Real-time factor: 0.643271887228699\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# you can also play with Global Style Token (GST) by feeding a \n",
- "# ... wav_style parameter to the tts method\n",
- "\n",
- "style_wav = {\"2\": 0.2}\n",
- "\n",
- "wav = synthesizer.tts(sentences[1], style_wav=style_wav)\n",
- "IPython.display.display(IPython.display.Audio(wav, rate=sample_rate)) "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Text splitted to sentences.\n",
- "['我喜欢听人工智能的博客。']\n",
- " > Processing time: 1.5687272548675537\n",
- " > Real-time factor: 0.6401842606201799\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Text splitted to sentences.\n",
- "['我喜欢听人工智能的博客。']\n",
- " > Processing time: 2.070594072341919\n",
- " > Real-time factor: 0.8067677285683367\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Text splitted to sentences.\n",
- "['我喜欢听人工智能的博客。']\n",
- " > Processing time: 1.3769311904907227\n",
- " > Real-time factor: 0.5088718951180015\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Text splitted to sentences.\n",
- "['我喜欢听人工智能的博客。']\n",
- " > Processing time: 2.024374485015869\n",
- " > Real-time factor: 0.6782983435843654\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Text splitted to sentences.\n",
- "['我喜欢听人工智能的博客。']\n",
- " > Processing time: 2.4434399604797363\n",
- " > Real-time factor: 0.7435119663360867\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# On this model specifically, we can observe that the GSToken \"2\" is responsible for speech speed\n",
- "# You can listen to these 5 different samples, the flow is slower and slower as the value is higher\n",
- "for value in [-0.2, -0.1, 0, 0.1, 0.2]:\n",
- " style_wav = {\"2\": value}\n",
- " wav = synthesizer.tts(sentences[1], style_wav=style_wav)\n",
- " IPython.display.display(IPython.display.Audio(wav, rate=sample_rate)) "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "accelerator": "GPU",
- "colab": {
- "collapsed_sections": [],
- "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb",
- "provenance": [],
- "toc_visible": true
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/DDC_TTS_and_MultiBand_MelGAN_Example.ipynb b/notebooks/DDC_TTS_and_MultiBand_MelGAN_Example.ipynb
deleted file mode 100644
index 67171b0e..00000000
--- a/notebooks/DDC_TTS_and_MultiBand_MelGAN_Example.ipynb
+++ /dev/null
@@ -1,342 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "6LWsNd3_M3MP"
- },
- "source": [
- "# Mozilla TTS on CPU Real-Time Speech Synthesis "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "FAqrSIWgLyP0"
- },
- "source": [
- "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n",
- "\n",
- "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n",
- "\n",
- "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n",
- "\n",
- "Note that both model performances can be improved with more training."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "Ku-dA4DKoeXk"
- },
- "source": [
- "### Download Models"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 162
- },
- "colab_type": "code",
- "id": "jGIgnWhGsxU1",
- "outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe",
- "tags": []
- },
- "outputs": [],
- "source": [
- "!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n",
- "!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 235
- },
- "colab_type": "code",
- "id": "4dnpE0-kvTsu",
- "outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e",
- "tags": []
- },
- "outputs": [],
- "source": [
- "!gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar\n",
- "!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json\n",
- "!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "Zlgi8fPdpRF0"
- },
- "source": [
- "### Define TTS function"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "f-Yc42nQZG5A"
- },
- "outputs": [],
- "source": [
- "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
- " t_1 = time.time()\n",
- " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
- " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
- " # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
- " if not use_gl:\n",
- " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
- " waveform = waveform.flatten()\n",
- " if use_cuda:\n",
- " waveform = waveform.cpu()\n",
- " waveform = waveform.numpy()\n",
- " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
- " tps = (time.time() - t_1) / len(waveform)\n",
- " print(waveform.shape)\n",
- " print(\" > Run-time: {}\".format(time.time() - t_1))\n",
- " print(\" > Real-time factor: {}\".format(rtf))\n",
- " print(\" > Time per step: {}\".format(tps))\n",
- " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n",
- " return alignment, mel_postnet_spec, stop_tokens, waveform"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "ZksegYQepkFg"
- },
- "source": [
- "### Load Models"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "oVa0kOamprgj"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "import torch\n",
- "import time\n",
- "import IPython\n",
- "\n",
- "from TTS.tts.utils.generic_utils import setup_model\n",
- "from TTS.utils.io import load_config\n",
- "from TTS.tts.utils.text.symbols import symbols, phonemes\n",
- "from TTS.utils.audio import AudioProcessor\n",
- "from TTS.tts.utils.synthesis import synthesis"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "EY-sHVO8IFSH"
- },
- "outputs": [],
- "source": [
- "# runtime settings\n",
- "use_cuda = False"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "_1aIUp2FpxOQ"
- },
- "outputs": [],
- "source": [
- "# model paths\n",
- "TTS_MODEL = \"data/tts_model.pth.tar\"\n",
- "TTS_CONFIG = \"data/config.json\"\n",
- "VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n",
- "VOCODER_CONFIG = \"data/config_vocoder.json\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "CpgmdBVQplbv"
- },
- "outputs": [],
- "source": [
- "# load configs\n",
- "TTS_CONFIG = load_config(TTS_CONFIG)\n",
- "VOCODER_CONFIG = load_config(VOCODER_CONFIG)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 471
- },
- "colab_type": "code",
- "id": "zmrQxiozIUVE",
- "outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49",
- "tags": []
- },
- "outputs": [],
- "source": [
- "# load the audio processor\n",
- "TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n",
- "ap = AudioProcessor(**TTS_CONFIG.audio) "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 35
- },
- "colab_type": "code",
- "id": "8fLoI4ipqMeS",
- "outputId": "b789066e-e305-42ad-b3ca-eba8d9267382",
- "tags": []
- },
- "outputs": [],
- "source": [
- "# LOAD TTS MODEL\n",
- "# multi speaker \n",
- "speaker_id = None\n",
- "speakers = []\n",
- "\n",
- "# load the model\n",
- "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n",
- "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n",
- "\n",
- "# load model state\n",
- "cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n",
- "\n",
- "# load the model\n",
- "model.load_state_dict(cp['model'])\n",
- "if use_cuda:\n",
- " model.cuda()\n",
- "model.eval()\n",
- "\n",
- "# set model stepsize\n",
- "if 'r' in cp:\n",
- " model.decoder.set_r(cp['r'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 1000
- },
- "colab_type": "code",
- "id": "zKoq0GgzqzhQ",
- "outputId": "234efc61-f37a-40bc-95a3-b51896018ccb",
- "tags": []
- },
- "outputs": [],
- "source": [
- "from TTS.vocoder.utils.generic_utils import setup_generator\n",
- "\n",
- "# LOAD VOCODER MODEL\n",
- "vocoder_model = setup_generator(VOCODER_CONFIG)\n",
- "vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n",
- "vocoder_model.remove_weight_norm()\n",
- "vocoder_model.inference_padding = 0\n",
- "\n",
- "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n",
- "if use_cuda:\n",
- " vocoder_model.cuda()\n",
- "vocoder_model.eval()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "Ws_YkPKsLgo-"
- },
- "source": [
- "## Run Inference"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 134
- },
- "colab_type": "code",
- "id": "FuWxZ9Ey5Puj",
- "outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91",
- "tags": []
- },
- "outputs": [],
- "source": [
- "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)"
- ]
- }
- ],
- "metadata": {
- "accelerator": "GPU",
- "colab": {
- "collapsed_sections": [],
- "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb",
- "provenance": [],
- "toc_visible": true
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/DDC_TTS_and_MultiBand_MelGAN_TF_Example.ipynb b/notebooks/DDC_TTS_and_MultiBand_MelGAN_TF_Example.ipynb
deleted file mode 100644
index 4b009ce9..00000000
--- a/notebooks/DDC_TTS_and_MultiBand_MelGAN_TF_Example.ipynb
+++ /dev/null
@@ -1,346 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "Collapsed": "false",
- "colab_type": "text",
- "id": "6LWsNd3_M3MP"
- },
- "source": [
- "# Mozilla TTS on CPU Real-Time Speech Synthesis with Tensorflow"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "Collapsed": "false",
- "colab_type": "text",
- "id": "FAqrSIWgLyP0"
- },
- "source": [
- "**These models are converted from released [PyTorch models](https://colab.research.google.com/drive/1u_16ZzHjKYFn1HNVuA4Qf_i2MMFB9olY?usp=sharing) using our TF utilities provided in Mozilla TTS.**\n",
- "\n",
- "These TF models support TF 2.2 and for different versions you might need to\n",
- "regenerate them. \n",
- "\n",
- "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n",
- "\n",
- "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n",
- "\n",
- "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n",
- "\n",
- "Note that both model performances can be improved with more training.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "Collapsed": "false",
- "colab_type": "text",
- "id": "Ku-dA4DKoeXk"
- },
- "source": [
- "### Download Models"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "Collapsed": "false",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 162
- },
- "colab_type": "code",
- "id": "jGIgnWhGsxU1",
- "outputId": "08b0dddd-4edf-48c9-e8e5-a419b36a5c3d",
- "tags": []
- },
- "outputs": [],
- "source": [
- "!gdown --id 1p7OSEEW_Z7ORxNgfZwhMy7IiLE1s0aH7 -O data/tts_model.pkl\n",
- "!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "Collapsed": "false",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 235
- },
- "colab_type": "code",
- "id": "4dnpE0-kvTsu",
- "outputId": "2fe836eb-c7e7-4f1e-9352-0142126bb19f",
- "tags": []
- },
- "outputs": [],
- "source": [
- "!gdown --id 1rHmj7CqD3Sfa716Y3ub_vpIBrQg_b1yF -O data/vocoder_model.pkl\n",
- "!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json\n",
- "!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "Collapsed": "false",
- "colab_type": "text",
- "id": "Zlgi8fPdpRF0"
- },
- "source": [
- "### Define TTS function"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "Collapsed": "false",
- "colab": {},
- "colab_type": "code",
- "id": "f-Yc42nQZG5A"
- },
- "outputs": [],
- "source": [
- "def tts(model, text, CONFIG, p):\n",
- " t_1 = time.time()\n",
- " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
- " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n",
- " backend='tf')\n",
- " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
- " waveform = waveform.numpy()[0, 0]\n",
- " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
- " tps = (time.time() - t_1) / len(waveform)\n",
- " print(waveform.shape)\n",
- " print(\" > Run-time: {}\".format(time.time() - t_1))\n",
- " print(\" > Real-time factor: {}\".format(rtf))\n",
- " print(\" > Time per step: {}\".format(tps))\n",
- " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n",
- " return alignment, mel_postnet_spec, stop_tokens, waveform"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "Collapsed": "false",
- "colab_type": "text",
- "id": "ZksegYQepkFg"
- },
- "source": [
- "### Load Models"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "Collapsed": "false",
- "colab": {},
- "colab_type": "code",
- "id": "oVa0kOamprgj"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "import torch\n",
- "import time\n",
- "import IPython\n",
- "\n",
- "from TTS.tts.tf.utils.generic_utils import setup_model\n",
- "from TTS.tts.tf.utils.io import load_checkpoint\n",
- "from TTS.utils.io import load_config\n",
- "from TTS.tts.utils.text.symbols import symbols, phonemes\n",
- "from TTS.utils.audio import AudioProcessor\n",
- "from TTS.tts.utils.synthesis import synthesis"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "Collapsed": "false",
- "colab": {},
- "colab_type": "code",
- "id": "EY-sHVO8IFSH"
- },
- "outputs": [],
- "source": [
- "# runtime settings\n",
- "use_cuda = False"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "Collapsed": "false",
- "colab": {},
- "colab_type": "code",
- "id": "_1aIUp2FpxOQ"
- },
- "outputs": [],
- "source": [
- "# model paths\n",
- "TTS_MODEL = \"data/tts_model.pkl\"\n",
- "TTS_CONFIG = \"data/config.json\"\n",
- "VOCODER_MODEL = \"data/vocoder_model.pkl\"\n",
- "VOCODER_CONFIG = \"data/config_vocoder.json\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "Collapsed": "false",
- "colab": {},
- "colab_type": "code",
- "id": "CpgmdBVQplbv"
- },
- "outputs": [],
- "source": [
- "# load configs\n",
- "TTS_CONFIG = load_config(TTS_CONFIG)\n",
- "VOCODER_CONFIG = load_config(VOCODER_CONFIG)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "Collapsed": "false",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 471
- },
- "colab_type": "code",
- "id": "zmrQxiozIUVE",
- "outputId": "fa71bd05-401f-4e5b-a6f7-60ae765966db",
- "tags": []
- },
- "outputs": [],
- "source": [
- "# load the audio processor\n",
- "TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n",
- "ap = AudioProcessor(**TTS_CONFIG.audio) "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "Collapsed": "false",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 72
- },
- "colab_type": "code",
- "id": "8fLoI4ipqMeS",
- "outputId": "595d990f-930d-4698-ee14-77796b5eed7d",
- "tags": []
- },
- "outputs": [],
- "source": [
- "# LOAD TTS MODEL\n",
- "# multi speaker \n",
- "speaker_id = None\n",
- "speakers = []\n",
- "\n",
- "# load the model\n",
- "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n",
- "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n",
- "model.build_inference()\n",
- "model = load_checkpoint(model, TTS_MODEL)\n",
- "model.decoder.set_max_decoder_steps(1000)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "Collapsed": "false",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 489
- },
- "colab_type": "code",
- "id": "zKoq0GgzqzhQ",
- "outputId": "2cc3deae-144f-4465-da3b-98628d948506"
- },
- "outputs": [],
- "source": [
- "from TTS.vocoder.tf.utils.generic_utils import setup_generator\n",
- "from TTS.vocoder.tf.utils.io import load_checkpoint\n",
- "\n",
- "# LOAD VOCODER MODEL\n",
- "vocoder_model = setup_generator(VOCODER_CONFIG)\n",
- "vocoder_model.build_inference()\n",
- "vocoder_model = load_checkpoint(vocoder_model, VOCODER_MODEL)\n",
- "vocoder_model.inference_padding = 0\n",
- "\n",
- "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "Collapsed": "false",
- "colab_type": "text",
- "id": "Ws_YkPKsLgo-"
- },
- "source": [
- "## Run Inference"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "Collapsed": "false",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 134
- },
- "colab_type": "code",
- "id": "FuWxZ9Ey5Puj",
- "outputId": "07ede6e5-06e6-4612-f687-7984d20e5254"
- },
- "outputs": [],
- "source": [
- "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)"
- ]
- }
- ],
- "metadata": {
- "colab": {
- "collapsed_sections": [],
- "name": "DDC-TTS_and_MultiBand-MelGAN_TF_Example.ipynb",
- "provenance": [],
- "toc_visible": true
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/DDC_TTS_and_ParallelWaveGAN_Example.ipynb b/notebooks/DDC_TTS_and_ParallelWaveGAN_Example.ipynb
deleted file mode 100644
index 4c1008e0..00000000
--- a/notebooks/DDC_TTS_and_ParallelWaveGAN_Example.ipynb
+++ /dev/null
@@ -1,342 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "6LWsNd3_M3MP"
- },
- "source": [
- "# Mozilla TTS on CPU Real-Time Speech Synthesis "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "FAqrSIWgLyP0"
- },
- "source": [
- "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n",
- "\n",
- "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n",
- "\n",
- "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n",
- "\n",
- "Note that both model performances can be improved with more training."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "Ku-dA4DKoeXk"
- },
- "source": [
- "### Download Models"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 162
- },
- "colab_type": "code",
- "id": "jGIgnWhGsxU1",
- "outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe",
- "tags": []
- },
- "outputs": [],
- "source": [
- "!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n",
- "!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 235
- },
- "colab_type": "code",
- "id": "4dnpE0-kvTsu",
- "outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e",
- "tags": []
- },
- "outputs": [],
- "source": [
- "!gdown --id 1X09hHAyAJOnrplCUMAdW_t341Kor4YR4 -O data/vocoder_model.pth.tar\n",
- "!gdown --id \"1qN7vQRIYkzvOX_DtiZtTajzoZ1eW1-Eg\" -O data/config_vocoder.json\n",
- "!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "Zlgi8fPdpRF0"
- },
- "source": [
- "### Define TTS function"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "f-Yc42nQZG5A"
- },
- "outputs": [],
- "source": [
- "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
- " t_1 = time.time()\n",
- " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
- " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
- " # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
- " if not use_gl:\n",
- " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
- " waveform = waveform.flatten()\n",
- " if use_cuda:\n",
- " waveform = waveform.cpu()\n",
- " waveform = waveform.numpy()\n",
- " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
- " tps = (time.time() - t_1) / len(waveform)\n",
- " print(waveform.shape)\n",
- " print(\" > Run-time: {}\".format(time.time() - t_1))\n",
- " print(\" > Real-time factor: {}\".format(rtf))\n",
- " print(\" > Time per step: {}\".format(tps))\n",
- " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n",
- " return alignment, mel_postnet_spec, stop_tokens, waveform"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "ZksegYQepkFg"
- },
- "source": [
- "### Load Models"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "oVa0kOamprgj"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "import torch\n",
- "import time\n",
- "import IPython\n",
- "\n",
- "from TTS.tts.utils.generic_utils import setup_model\n",
- "from TTS.utils.io import load_config\n",
- "from TTS.tts.utils.text.symbols import symbols, phonemes\n",
- "from TTS.utils.audio import AudioProcessor\n",
- "from TTS.tts.utils.synthesis import synthesis"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "EY-sHVO8IFSH"
- },
- "outputs": [],
- "source": [
- "# runtime settings\n",
- "use_cuda = False"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "_1aIUp2FpxOQ"
- },
- "outputs": [],
- "source": [
- "# model paths\n",
- "TTS_MODEL = \"data/tts_model.pth.tar\"\n",
- "TTS_CONFIG = \"data/config.json\"\n",
- "VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n",
- "VOCODER_CONFIG = \"data/config_vocoder.json\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "CpgmdBVQplbv"
- },
- "outputs": [],
- "source": [
- "# load configs\n",
- "TTS_CONFIG = load_config(TTS_CONFIG)\n",
- "VOCODER_CONFIG = load_config(VOCODER_CONFIG)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 471
- },
- "colab_type": "code",
- "id": "zmrQxiozIUVE",
- "outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49",
- "tags": []
- },
- "outputs": [],
- "source": [
- "# load the audio processor\n",
- "TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n",
- "ap = AudioProcessor(**TTS_CONFIG.audio) "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 35
- },
- "colab_type": "code",
- "id": "8fLoI4ipqMeS",
- "outputId": "b789066e-e305-42ad-b3ca-eba8d9267382",
- "tags": []
- },
- "outputs": [],
- "source": [
- "# LOAD TTS MODEL\n",
- "# multi speaker \n",
- "speaker_id = None\n",
- "speakers = []\n",
- "\n",
- "# load the model\n",
- "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n",
- "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n",
- "\n",
- "# load model state\n",
- "cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n",
- "\n",
- "# load the model\n",
- "model.load_state_dict(cp['model'])\n",
- "if use_cuda:\n",
- " model.cuda()\n",
- "model.eval()\n",
- "\n",
- "# set model stepsize\n",
- "if 'r' in cp:\n",
- " model.decoder.set_r(cp['r'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 1000
- },
- "colab_type": "code",
- "id": "zKoq0GgzqzhQ",
- "outputId": "234efc61-f37a-40bc-95a3-b51896018ccb",
- "tags": []
- },
- "outputs": [],
- "source": [
- "from TTS.vocoder.utils.generic_utils import setup_generator\n",
- "\n",
- "# LOAD VOCODER MODEL\n",
- "vocoder_model = setup_generator(VOCODER_CONFIG)\n",
- "vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n",
- "vocoder_model.remove_weight_norm()\n",
- "vocoder_model.inference_padding = 0\n",
- "\n",
- "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n",
- "if use_cuda:\n",
- " vocoder_model.cuda()\n",
- "vocoder_model.eval()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "Ws_YkPKsLgo-"
- },
- "source": [
- "## Run Inference"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 134
- },
- "colab_type": "code",
- "id": "FuWxZ9Ey5Puj",
- "outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91",
- "tags": []
- },
- "outputs": [],
- "source": [
- "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)"
- ]
- }
- ],
- "metadata": {
- "accelerator": "GPU",
- "colab": {
- "collapsed_sections": [],
- "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb",
- "provenance": [],
- "toc_visible": true
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}