{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "6LWsNd3_M3MP",
"colab_type": "text"
},
"source": [
"# Mozilla TTS on CPU Real-Time Speech Synthesis "
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "FAqrSIWgLyP0",
"colab_type": "text"
},
"source": [
"We use Tacotron2 and MultiBand-MelGAN models trained on the LJSpeech dataset.\n",
"\n",
"Tacotron2 is trained with [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) for only 130K steps (3 days) on a single GPU.\n",
"\n",
"MultiBand-MelGAN is trained for 1.45M steps with real spectrograms.\n",
"\n",
"Note that the performance of both models can be improved with more training."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Ku-dA4DKoeXk",
"colab_type": "text"
},
"source": [
"### Download Models"
]
},
{
"cell_type": "code",
"metadata": {
"id": "jGIgnWhGsxU1",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 162
},
"outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe",
"tags": []
},
"source": [
"!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n",
"!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "4dnpE0-kvTsu",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 235
},
"outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e",
"tags": []
},
"source": [
"!gdown --id 1X09hHAyAJOnrplCUMAdW_t341Kor4YR4 -O data/vocoder_model.pth.tar\n",
"!gdown --id \"1qN7vQRIYkzvOX_DtiZtTajzoZ1eW1-Eg\" -O data/config_vocoder.json\n",
"!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "Zlgi8fPdpRF0",
"colab_type": "text"
},
"source": [
"### Define TTS function"
]
},
{
"cell_type": "code",
"metadata": {
"id": "f-Yc42nQZG5A",
"colab_type": "code",
"colab": {}
},
"source": [
"def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
"    t_1 = time.time()\n",
"    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n",
"                                                                                     truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
"    # mel_postnet_spec = ap._denormalize(mel_postnet_spec.T)\n",
"    if not use_gl:\n",
"        waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
"        waveform = waveform.flatten()\n",
"    if use_cuda:\n",
"        waveform = waveform.cpu()\n",
"    waveform = waveform.numpy()\n",
"    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
"    tps = (time.time() - t_1) / len(waveform)\n",
"    print(waveform.shape)\n",
"    print(\" > Run-time: {}\".format(time.time() - t_1))\n",
"    print(\" > Real-time factor: {}\".format(rtf))\n",
"    print(\" > Time per step: {}\".format(tps))\n",
"    IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate']))\n",
"    return alignment, mel_postnet_spec, stop_tokens, waveform"
],
"execution_count": null,
"outputs": []
},
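{
"cell_type": "markdown",
"metadata": {},
"source": [
"A note on the metrics printed by `tts()` above: the real-time factor is computed as\n",
"\n",
"$$\\mathrm{RTF} = \\frac{\\text{synthesis time}}{\\text{audio duration}} = \\frac{t_{\\mathrm{synth}}}{\\mathrm{len(waveform)}\\,/\\,\\mathrm{sample\\_rate}},$$\n",
"\n",
"so an RTF below 1 means speech is synthesized faster than real time. The reported time per step is the synthesis time divided by the number of generated waveform samples."
]
},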
{
"cell_type": "markdown",
"metadata": {
"id": "ZksegYQepkFg",
"colab_type": "text"
},
"source": [
"### Load Models"
]
},
{
"cell_type": "code",
"metadata": {
"id": "oVa0kOamprgj",
"colab_type": "code",
"colab": {}
},
"source": [
"import os\n",
"import torch\n",
"import time\n",
"import IPython\n",
"\n",
"from mozilla_voice_tts.tts.utils.generic_utils import setup_model\n",
"from mozilla_voice_tts.utils.io import load_config\n",
"from mozilla_voice_tts.tts.utils.text.symbols import symbols, phonemes\n",
"from mozilla_voice_tts.utils.audio import AudioProcessor\n",
"from mozilla_voice_tts.tts.utils.synthesis import synthesis"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "EY-sHVO8IFSH",
"colab_type": "code",
"colab": {}
},
"source": [
"# runtime settings\n",
"use_cuda = False"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "_1aIUp2FpxOQ",
"colab_type": "code",
"colab": {}
},
"source": [
"# model paths\n",
"TTS_MODEL = \"data/tts_model.pth.tar\"\n",
"TTS_CONFIG = \"data/config.json\"\n",
"VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n",
"VOCODER_CONFIG = \"data/config_vocoder.json\""
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "CpgmdBVQplbv",
"colab_type": "code",
"colab": {}
},
"source": [
"# load configs\n",
"TTS_CONFIG = load_config(TTS_CONFIG)\n",
"VOCODER_CONFIG = load_config(VOCODER_CONFIG)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "zmrQxiozIUVE",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 471
},
"outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49",
"tags": []
},
"source": [
"# load the audio processor\n",
"TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n",
"ap = AudioProcessor(**TTS_CONFIG.audio) "
],
"execution_count": null,
"outputs": []
},
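{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, a minimal sanity check on the loaded configs: it assumes the standard Mozilla TTS config keys `sample_rate`, `num_mels` and `hop_length` and simply prints them for the TTS and vocoder sides, since mismatched audio settings are a common cause of distorted output."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Minimal config sanity check (sketch): print a few audio parameters\n",
"# that synthesis and the vocoder rely on. The key names are assumed to\n",
"# follow the standard Mozilla TTS config layout.\n",
"for key in ('sample_rate', 'num_mels', 'hop_length'):\n",
"    print(key, '->', TTS_CONFIG.audio.get(key), '(TTS) /', VOCODER_CONFIG['audio'].get(key), '(vocoder)')"
],
"execution_count": null,
"outputs": []
},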
{
"cell_type": "code",
"metadata": {
"id": "8fLoI4ipqMeS",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "b789066e-e305-42ad-b3ca-eba8d9267382",
"tags": []
},
"source": [
"# LOAD TTS MODEL\n",
"# multi speaker\n",
"speaker_id = None\n",
"speakers = []\n",
"\n",
"# load the model\n",
"num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n",
"model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n",
"\n",
"# load model state\n",
"cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n",
"\n",
"# load model weights\n",
"model.load_state_dict(cp['model'])\n",
"if use_cuda:\n",
"    model.cuda()\n",
"model.eval()\n",
"\n",
"# set model stepsize (decoder reduction factor) from the checkpoint\n",
"if 'r' in cp:\n",
"    model.decoder.set_r(cp['r'])"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "zKoq0GgzqzhQ",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"outputId": "234efc61-f37a-40bc-95a3-b51896018ccb",
"tags": []
},
"source": [
"from mozilla_voice_tts.vocoder.utils.generic_utils import setup_generator\n",
"\n",
"# LOAD VOCODER MODEL\n",
"vocoder_model = setup_generator(VOCODER_CONFIG)\n",
"vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n",
"vocoder_model.remove_weight_norm()\n",
"vocoder_model.inference_padding = 0\n",
"\n",
"ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n",
"if use_cuda:\n",
" vocoder_model.cuda()\n",
"vocoder_model.eval()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "Ws_YkPKsLgo-",
"colab_type": "text"
},
"source": [
"## Run Inference"
]
},
{
"cell_type": "code",
"metadata": {
"id": "FuWxZ9Ey5Puj",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 134
},
"outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91",
"tags": []
},
"source": [
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn't absolutely certain it was, he just let it go.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)"
],
"execution_count": null,
"outputs": []
}
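,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To keep the generated audio, a minimal sketch for writing it to disk, assuming `AudioProcessor.save_wav()` is available as in `mozilla_voice_tts.utils.audio` (alternatively, `scipy.io.wavfile.write` with `ap.sample_rate` works as well)."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Save the waveform returned by tts() to a wav file (sketch).\n",
"# 'tts_output.wav' is just an example path.\n",
"ap.save_wav(wav, 'tts_output.wav')"
],
"execution_count": null,
"outputs": []
}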
]
}