TTS/notebooks/Benchmark.ipynb

381 lines
10 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import os\n",
"import sys\n",
"import io\n",
"import torch \n",
"import time\n",
"import numpy as np\n",
"from collections import OrderedDict\n",
"from matplotlib import pylab as plt\n",
"\n",
"%pylab inline\n",
"rcParams[\"figure.figsize\"] = (16,5)\n",
"sys.path.append('/home/erogol/projects/')\n",
"\n",
"import librosa\n",
"import librosa.display\n",
"\n",
"from TTS.models.tacotron import Tacotron \n",
"from TTS.layers import *\n",
"from TTS.utils.data import *\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.generic_utils import load_config\n",
"from TTS.utils.text import text_to_sequence\n",
"\n",
"import IPython\n",
"from IPython.display import Audio\n",
"from utils import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def tts(model, text, CONFIG, use_cuda, ap, figures=True):\n",
"    \"\"\"Synthesize `text`, play it inline and save it as a wav file.\n",
"\n",
"    Args:\n",
"        model: model forwarded to `create_speech`.\n",
"        text: sentence to synthesize; also used to derive the output file name.\n",
"        CONFIG: loaded config; `CONFIG.sample_rate` is used as the playback rate.\n",
"        use_cuda: forwarded to `create_speech`.\n",
"        ap: audio processor; its `save_wav` writes the waveform to disk.\n",
"        figures: when True, call `visualize` on the alignment, spectrogram and stop tokens.\n",
"\n",
"    Returns:\n",
"        The `(alignment, spectrogram, stop_tokens)` values produced by `create_speech`.\n",
"    \"\"\"\n",
"    t_1 = time.time()\n",
"    waveform, alignment, spectrogram, stop_tokens = create_speech(model, text, CONFIG, use_cuda, ap)\n",
"    print(\" > Run-time: {}\".format(time.time() - t_1))\n",
"    if figures:\n",
"        visualize(alignment, spectrogram, stop_tokens, CONFIG)\n",
"    IPython.display.display(Audio(waveform, rate=CONFIG.sample_rate))\n",
"    out_path = 'benchmark_samples/'\n",
"    os.makedirs(out_path, exist_ok=True)\n",
"    file_name = text.replace(\" \", \"_\").replace(\".\", \"\")\n",
"    # A path separator inside the sentence would be treated as a directory; neutralize it.\n",
"    for sep in (os.sep, os.altsep):\n",
"        if sep:\n",
"            file_name = file_name.replace(sep, \"_\")\n",
"    out_path = os.path.join(out_path, file_name + \".wav\")\n",
"    ap.save_wav(waveform, out_path)\n",
"    return alignment, spectrogram, stop_tokens"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set constants\n",
"ROOT_PATH = '/data/shared/erogol_models/May-22-2018_03:24PM-loc-sen-attn-e6112f7/'\n",
"# ROOT_PATH already ends with '/', so join path components instead of concatenating strings.\n",
"MODEL_PATH = os.path.join(ROOT_PATH, 'checkpoint_272976.pth.tar')\n",
"CONFIG_PATH = os.path.join(ROOT_PATH, 'config.json')\n",
"OUT_FOLDER = os.path.join(ROOT_PATH, 'test', '')  # the empty last component keeps the trailing separator\n",
"CONFIG = load_config(CONFIG_PATH)\n",
"use_cuda = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build the network from the hyper-parameters stored in CONFIG.\n",
"model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels, CONFIG.r)\n",
"\n",
"# Build the audio processor from the same CONFIG.\n",
"ap = AudioProcessor(\n",
"    CONFIG.sample_rate, CONFIG.num_mels, CONFIG.min_level_db,\n",
"    CONFIG.frame_shift_ms, CONFIG.frame_length_ms, CONFIG.preemphasis,\n",
"    CONFIG.ref_level_db, CONFIG.num_freq, CONFIG.power,\n",
"    griffin_lim_iters=30,\n",
")\n",
"\n",
"# Read the checkpoint; without CUDA, pass a map_location that returns each storage unchanged.\n",
"cp = (\n",
"    torch.load(MODEL_PATH)\n",
"    if use_cuda\n",
"    else torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)\n",
")\n",
"\n",
"# Restore the weights, optionally move the model to the GPU, and switch to eval mode.\n",
"model.load_state_dict(cp['model'])\n",
"if use_cuda:\n",
"    model.cuda()\n",
"model.eval()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### EXAMPLES FROM TRAINING SET"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"df = pd.read_csv('/data/shared/KeithIto/LJSpeech-1.0/metadata_val.csv', delimiter='|')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"sentence = df.iloc[175, 1]\n",
"print(sentence)\n",
"model.decoder.max_decoder_steps = 250\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Comparison with https://mycroft.ai/blog/available-voices/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"sentence = \"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.\"\n",
"model.decoder.max_decoder_steps = 250\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Be a voice,not an echo.\" # 'echo' is not in training set. \n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"The human voice is the most perfect instrument of all.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"This cake is great. It's so delicious and moist.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Comparison with https://keithito.github.io/audio-samples/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Here's a way to measure the acute emotional intelligence that has never gone out of style.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"The buses aren't the problem, they actually provide a solution.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"He has read the whole thing.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"He reads books.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Thisss isrealy awhsome.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"This is your internet browser, Firefox.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"This is your internet browser Firefox.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"The quick brown fox jumps over the lazy dog.\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Does the quick brown fox jump over the lazy dog?\"\n",
"align, spec, stop_tokens = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!zip benchmark_samples/samples.zip benchmark_samples/*"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}