{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook reads a given article by giving each sentence individually to the network without any state passing. You can also compare different checkpoints below." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/erogol/miniconda3/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['plt']\n", "`%matplotlib` prevents importing * from pylab and numpy\n", " \"\\n`%matplotlib` prevents importing * from pylab and numpy\"\n" ] } ], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "import os\n", "import sys\n", "import io\n", "import torch \n", "import time\n", "import numpy as np\n", "from collections import OrderedDict\n", "from matplotlib import pylab as plt\n", "\n", "%pylab inline\n", "rcParams[\"figure.figsize\"] = (16,5)\n", "sys.path.append('/home/erogol/Projects/') # change here if you don't install TTS by setup.py\n", "\n", "import librosa\n", "import librosa.display\n", "\n", "from TTS.models.tacotron import Tacotron \n", "from TTS.layers import *\n", "from TTS.utils.data import *\n", "from TTS.utils.audio import AudioProcessor\n", "from TTS.utils.generic_utils import load_config\n", "from TTS.utils.text import text_to_sequence\n", "\n", "import IPython\n", "from IPython.display import Audio\n", "from synthesis import *" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def tts(model, text, CONFIG, use_cuda, ap, figures=True):\n", " waveform, alignment, spectrogram, stop_tokens = create_speech(model, text, CONFIG, use_cuda, ap) \n", " return waveform\n", "\n", "def text2audio(text, model, CONFIG, use_cuda, ap):\n", " wavs = []\n", " for sen in text.split('.'):\n", " if len(sen) < 3:\n", " continue\n", " sen+='.'\n", " sen = sen.strip()\n", " print(sen)\n", " wav = tts(model, sen, CONFIG, use_cuda, ap)\n", " wavs.append(wav)\n", " wavs.append(np.zeros(10000))\n", "# audio = np.stack(wavs)\n", "# IPython.display.display(Audio(audio, rate=CONFIG.sample_rate)) \n", " return wavs" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Set constants\n", "ROOT_PATH = '/home/erogol/Projects/runs/local_runs/September-26-2018_06+55PM-TTS-attn-smoothing-bgs-sigmoid-wd-231607a/'\n", "MODEL_PATH_TMP = ROOT_PATH + '/checkpoint_{}.pth.tar'\n", "# MODEL_PATH_TMP = ROOT_PATH + '/best_model.pth.tar'\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n", "OUT_FOLDER = ROOT_PATH + '/test'\n", "CONFIG = load_config(CONFIG_PATH)\n", "use_cuda = True" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Try different checkpoints\n", "check_idxs = [150000, 200000, 250000, 300000, 350000, 400000]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " | > Number of characters : 149\n", " > Setting up Audio Processor...\n", " | > fft size: 2048, hop length: 275, win length: 1100\n", "/home/erogol/Projects/runs/local_runs/September-26-2018_06+55PM-TTS-attn-smoothing-bgs-sigmoid-wd-231607a//best_model.pth.tar\n", "Voice is natural, voice is human.\n", "That’s why we are fascinated with creating usable voice technology for our machines.\n", "But to create voice systems, an extremely large amount of voice data is required.\n", "Most of the data used by large companies isn’t available to the majority of people.\n", "We think that stifles innovation.\n", "So we’ve launched Project Common Voice, a project to help make voice recognition open to everyone.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# load the model\n", "model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels, CONFIG.r)\n", "\n", "# load the audio processor\n", "\n", "ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.min_level_db,\n", " CONFIG.frame_shift_ms, CONFIG.frame_length_ms,\n", " CONFIG.ref_level_db, CONFIG.num_freq, CONFIG.power, CONFIG.preemphasis,\n", " 60) \n", "\n", "for idx in check_idxs:\n", " MODEL_PATH = MODEL_PATH_TMP.format(idx)\n", " print(MODEL_PATH)\n", " \n", " # load model state\n", " if use_cuda:\n", " cp = torch.load(MODEL_PATH)\n", " else:\n", " cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)\n", "\n", " # load the model\n", " model.load_state_dict(cp['model'])\n", " if use_cuda:\n", " model.cuda()\n", " model.eval()\n", "\n", " model.decoder.max_decoder_steps = 400\n", " text = \"Voice is natural, voice is human. That’s why we are fascinated with creating usable voice technology for our machines. But to create voice systems, an extremely large amount of voice data is required. Most of the data used by large companies isn’t available to the majority of people. We think that stifles innovation. So we’ve launched Project Common Voice, a project to help make voice recognition open to everyone.\"\n", "# text = \"Does the quick brown fox jump over the lazy dog?\"\n", " wavs = text2audio(text, model, CONFIG, use_cuda, ap)\n", "\n", " audio = np.concatenate(wavs)\n", " IPython.display.display(Audio(audio, rate=CONFIG.sample_rate)) \n", " ap.save_wav(audio, 'benchmark_samples/CommonVoice.wav')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }