{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018.ipynb", "provenance": [], "collapsed_sections": [ "vnV-FigfvsS2", "hkvv7gRcx4WV", "QJ6VgT2a4vHW" ] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "yZK6UdwSFnOO", "colab_type": "text" }, "source": [ "# **Download and install Mozilla TTS**" ] }, { "cell_type": "code", "metadata": { "id": "yvb0pX3WY6MN", "colab_type": "code", "colab": {} }, "source": [ "import os \n", "!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "iB9nl2UEG3SY", "colab_type": "code", "colab": {} }, "source": [ "!apt-get install espeak\n", "os.chdir('TTS')\n", "!pip install -r requirements.txt\n", "!python setup.py develop\n", "os.chdir('..')" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "w6Krn8k1inC_", "colab_type": "text" }, "source": [ "\n", "\n", "**Download Checkpoint**\n", "\n", "\n" ] }, { "cell_type": "code", "metadata": { "id": "PiYHf3lKhi9z", "colab_type": "code", "colab": {} }, "source": [ "!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018.zip\n", "!unzip ./TTS-checkpoint.zip\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "MpYNgqrZcJKn", "colab_type": "text" }, "source": [ "**Utils Functions**" ] }, { "cell_type": "code", "metadata": { "id": "4KZA4b_CbMqx", "colab_type": "code", "colab": {} }, "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "import argparse\n", "import json\n", "# pylint: disable=redefined-outer-name, unused-argument\n", "import os\n", "import string\n", "import time\n", "import sys\n", "import numpy as np\n", "\n", "TTS_PATH = \"../content/TTS\"\n", "# add libraries into environment\n", "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n", "\n", "import torch\n", "\n", "from TTS.tts.utils.generic_utils import setup_model\n", "from TTS.tts.utils.synthesis import synthesis\n", "from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n", "from TTS.utils.audio import AudioProcessor\n", "from TTS.utils.io import load_config\n", "from TTS.vocoder.utils.generic_utils import setup_generator\n", "\n", "\n", "def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None):\n", " t_1 = time.time()\n", " waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, None, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n", " if CONFIG.model == \"Tacotron\" and not use_gl:\n", " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", " if not use_gl:\n", " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", " if use_cuda and not use_gl:\n", " waveform = waveform.cpu()\n", " if not use_gl:\n", " waveform = waveform.numpy()\n", " waveform = waveform.squeeze()\n", " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", " tps = (time.time() - t_1) / len(waveform)\n", " print(\" > Run-time: {}\".format(time.time() - t_1))\n", " print(\" > Real-time factor: {}\".format(rtf))\n", " print(\" > Time per step: {}\".format(tps))\n", " return waveform\n", "\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ENA2OumIVeMA", "colab_type": "text" }, "source": [ "# **Vars definitions**\n" ] }, { "cell_type": "code", "metadata": { "id": "jPD0d_XpVXmY", "colab_type": "code", "colab": {} }, "source": [ "TEXT = ''\n", "OUT_PATH = 'tests-audios/'\n", "# create output path\n", "os.makedirs(OUT_PATH, exist_ok=True)\n", "\n", "SPEAKER_FILEID = None # if None use the first embedding from speakers.json\n", "\n", "# model vars \n", "MODEL_PATH = 'best_model.pth.tar'\n", "CONFIG_PATH = 'config.json'\n", "SPEAKER_JSON = 'speakers.json'\n", "\n", "# vocoder vars\n", "VOCODER_PATH = ''\n", "VOCODER_CONFIG_PATH = ''\n", "\n", "USE_CUDA = True" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "dV6cXXlfi72r", "colab_type": "text" }, "source": [ "# **Restore TTS Model**" ] }, { "cell_type": "code", "metadata": { "id": "x1WgLFauWUPe", "colab_type": "code", "colab": {} }, "source": [ "# load the config\n", "C = load_config(CONFIG_PATH)\n", "C.forward_attn_mask = True\n", "\n", "# load the audio processor\n", "ap = AudioProcessor(**C.audio)\n", "\n", "# if the vocabulary was passed, replace the default\n", "if 'characters' in C.keys():\n", " symbols, phonemes = make_symbols(**C.characters)\n", "\n", "speaker_embedding = None\n", "speaker_embedding_dim = None\n", "num_speakers = 0\n", "# load speakers\n", "if SPEAKER_JSON != '':\n", " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", " num_speakers = len(speaker_mapping)\n", " if C.use_external_speaker_embedding_file:\n", " if SPEAKER_FILEID is not None:\n", " speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']\n", " else: # if speaker_fileid is not specificated use the first sample in speakers.json\n", " choise_speaker = list(speaker_mapping.keys())[0]\n", " print(\" Speaker: \",choise_speaker.split('_')[0],'was chosen automatically', \"(this speaker seen in training)\")\n", " speaker_embedding = speaker_mapping[choise_speaker]['embedding']\n", " speaker_embedding_dim = len(speaker_embedding)\n", "\n", "# load the model\n", "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", "model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n", "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", "model.load_state_dict(cp['model'])\n", "model.eval()\n", "\n", "if USE_CUDA:\n", " model.cuda()\n", "\n", "model.decoder.set_r(cp['r'])\n", "\n", "# load vocoder model\n", "if VOCODER_PATH!= \"\":\n", " VC = load_config(VOCODER_CONFIG_PATH)\n", " vocoder_model = setup_generator(VC)\n", " vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n", " vocoder_model.remove_weight_norm()\n", " if USE_CUDA:\n", " vocoder_model.cuda()\n", " vocoder_model.eval()\n", "else:\n", " vocoder_model = None\n", " VC = None\n", "\n", "# synthesize voice\n", "use_griffin_lim = VOCODER_PATH== \"\"\n", "\n", "if not C.use_external_speaker_embedding_file:\n", " if SPEAKER_FILEID.isdigit():\n", " SPEAKER_FILEID = int(SPEAKER_FILEID)\n", " else:\n", " SPEAKER_FILEID = None\n", "else:\n", " SPEAKER_FILEID = None\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "tNvVEoE30qY6", "colab_type": "text" }, "source": [ "Synthesize sentence with Speaker\n", "\n", "> Stop running the cell to leave!\n", "\n" ] }, { "cell_type": "code", "metadata": { "id": "2o8fXkVSyXOa", "colab_type": "code", "colab": {} }, "source": [ "import IPython\n", "from IPython.display import Audio\n", "print(\"Synthesize sentence with Speaker: \",choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n", "while True:\n", " TEXT = input(\"Enter sentence: \")\n", " print(\" > Text: {}\".format(TEXT))\n", " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n", " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", " # save the results\n", " file_name = TEXT.replace(\" \", \"_\")\n", " file_name = file_name.translate(\n", " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", " out_path = os.path.join(OUT_PATH, file_name)\n", " print(\" > Saving output to {}\".format(out_path))\n", " ap.save_wav(wav, out_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "vnV-FigfvsS2", "colab_type": "text" }, "source": [ "# **Select Speaker**\n", "\n" ] }, { "cell_type": "code", "metadata": { "id": "RuCGOnJ_fgDV", "colab_type": "code", "colab": {} }, "source": [ "\n", "# VCTK speakers not seen in training (new speakers)\n", "VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n", "\n", "# VCTK speakers seen in training\n", "VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n", "\n", "\n", "num_samples_speaker = 2 # In theory the more samples of the speaker the more similar to the real voice it will be!\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "hkvv7gRcx4WV", "colab_type": "text" }, "source": [ "## **Example select a VCTK seen speaker in training**" ] }, { "cell_type": "code", "metadata": { "id": "BviNMI9UyCYz", "colab_type": "code", "colab": {} }, "source": [ "# get embedding\n", "Speaker_choise = VCTK_train_Speakers[0] # choise one of training speakers\n", "# load speakers\n", "if SPEAKER_JSON != '':\n", " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", " if C.use_external_speaker_embedding_file:\n", " speaker_embeddings = []\n", " for key in list(speaker_mapping.keys()):\n", " if Speaker_choise in key:\n", " if len(speaker_embeddings) < num_samples_speaker:\n", " speaker_embeddings.append(speaker_mapping[key]['embedding'])\n", " # takes the average of the embedings samples of the announcers\n", " speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n", " " ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "5e5_XnLsx3jg", "colab_type": "code", "colab": {} }, "source": [ "import IPython\n", "from IPython.display import Audio\n", "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker seen in training)\")\n", "while True:\n", " TEXT = input(\"Enter sentence: \")\n", " print(\" > Text: {}\".format(TEXT))\n", " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n", " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", " # save the results\n", " file_name = TEXT.replace(\" \", \"_\")\n", " file_name = file_name.translate(\n", " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", " out_path = os.path.join(OUT_PATH, file_name)\n", " print(\" > Saving output to {}\".format(out_path))\n", " ap.save_wav(wav, out_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "QJ6VgT2a4vHW" }, "source": [ "## **Example select a VCTK not seen speaker in training (new Speakers)**\n", "\n", "\n", "> Fitting new Speakers :)\n", "\n" ] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "SZS57ZK-4vHa", "colab": {} }, "source": [ "# get embedding\n", "Speaker_choise = VCTK_test_Speakers[0] # choise one of training speakers\n", "# load speakers\n", "if SPEAKER_JSON != '':\n", " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", " if C.use_external_speaker_embedding_file:\n", " speaker_embeddings = []\n", " for key in list(speaker_mapping.keys()):\n", " if Speaker_choise in key:\n", " if len(speaker_embeddings) < num_samples_speaker:\n", " speaker_embeddings.append(speaker_mapping[key]['embedding'])\n", " # takes the average of the embedings samples of the announcers\n", " speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n", " " ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "bbs85vzz4vHo", "colab": {} }, "source": [ "import IPython\n", "from IPython.display import Audio\n", "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", "while True:\n", " TEXT = input(\"Enter sentence: \")\n", " print(\" > Text: {}\".format(TEXT))\n", " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n", " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", " # save the results\n", " file_name = TEXT.replace(\" \", \"_\")\n", " file_name = file_name.translate(\n", " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", " out_path = os.path.join(OUT_PATH, file_name)\n", " print(\" > Saving output to {}\".format(out_path))\n", " ap.save_wav(wav, out_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "LEE6mQLh5Who" }, "source": [ "# **Example Synthesizing with your own voice :)**\n", "\n" ] }, { "cell_type": "markdown", "metadata": { "id": "La70gSB65nrs", "colab_type": "text" }, "source": [ " Download and load GE2E Speaker Encoder " ] }, { "cell_type": "code", "metadata": { "id": "r0IEFZ0B5vQg", "colab_type": "code", "colab": {} }, "source": [ "!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n", "!unzip ./SpeakerEncoder-checkpoint.zip" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "jEH8HCTh5mF6", "colab_type": "code", "colab": {} }, "source": [ "SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n", "SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n", "SE_CONFIG_PATH =os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n", "USE_CUDA = True" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "tOwkfQqT6-Qo", "colab_type": "code", "colab": {} }, "source": [ "from TTS.utils.audio import AudioProcessor\n", "from TTS.speaker_encoder.model import SpeakerEncoder\n", "se_config = load_config(SE_CONFIG_PATH)\n", "se_ap = AudioProcessor(**se_config['audio'])\n", "\n", "se_model = SpeakerEncoder(**se_config.model)\n", "se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n", "se_model.eval()\n", "if USE_CUDA:\n", " se_model.cuda()" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "0TLlbUFG8O36", "colab_type": "text" }, "source": [ "Upload a wav audio file in your voice.\n", "\n", "\n", "> We recommend files longer than 3 seconds, the bigger the file the closer to your voice :)\n", "\n" ] }, { "cell_type": "code", "metadata": { "id": "_FWwHPjJ8NXl", "colab_type": "code", "colab": {} }, "source": [ "from google.colab import files\n", "file_list = files.upload()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "WWOf6sgbBbGY", "colab_type": "code", "colab": {} }, "source": [ "# extract embedding from wav files\n", "speaker_embeddings = []\n", "for name in file_list.keys():\n", " if '.wav' in name:\n", " mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n", " mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n", " if USE_CUDA:\n", " mel_spec = mel_spec.cuda()\n", " embedd = se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n", " speaker_embeddings.append(embedd)\n", " else:\n", " print(\" You need upload Wav files, others files is not supported !!\")\n", "\n", "# takes the average of the embedings samples of the announcers\n", "speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "xmItcGac5WiG", "colab": {} }, "source": [ "import IPython\n", "from IPython.display import Audio\n", "print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n", "while True:\n", " TEXT = input(\"Enter sentence: \")\n", " print(\" > Text: {}\".format(TEXT))\n", " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n", " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", " # save the results\n", " file_name = TEXT.replace(\" \", \"_\")\n", " file_name = file_name.translate(\n", " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", " out_path = os.path.join(OUT_PATH, file_name)\n", " print(\" > Saving output to {}\".format(out_path))\n", " ap.save_wav(wav, out_path)" ], "execution_count": null, "outputs": [] } ] }