TTS/notebooks/ReadArticle.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import os\n",
"import sys\n",
"import io\n",
"import torch \n",
"import time\n",
"import numpy as np\n",
"from collections import OrderedDict\n",
"from matplotlib import pylab as plt\n",
"\n",
"%pylab inline\n",
"rcParams[\"figure.figsize\"] = (16,5)\n",
"sys.path.append('/home/erogol/projects/')\n",
"\n",
"import librosa\n",
"import librosa.display\n",
"\n",
"from TTS.models.tacotron import Tacotron \n",
"from TTS.layers import *\n",
"from TTS.utils.data import *\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.generic_utils import load_config\n",
"from TTS.utils.text import text_to_sequence\n",
"\n",
"import IPython\n",
"from IPython.display import Audio\n",
"from utils import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ls /data/shared/erogol_models/May-22-2018_03:24PM-loc-sen-attn-e6112f7"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def tts(model, text, CONFIG, use_cuda, ap, figures=True):\n",
" waveform, alignment, spectrogram, stop_tokens = create_speech(model, text, CONFIG, use_cuda, ap) \n",
" return waveform\n",
"\n",
"def text2audio(text, model, CONFIG, use_cuda, ap):\n",
" wavs = []\n",
" for sen in text.split('.'):\n",
" if len(sen) < 3:\n",
" continue\n",
" sen+='.'\n",
" sen = sen.strip()\n",
" print(sen)\n",
" wav = tts(model, sen, CONFIG, use_cuda, ap)\n",
" wavs.append(wav)\n",
" wavs.append(np.zeros(10000))\n",
"# audio = np.stack(wavs)\n",
"# IPython.display.display(Audio(audio, rate=CONFIG.sample_rate)) \n",
" return wavs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set constants\n",
"ROOT_PATH = '/data/shared/erogol_models/May-22-2018_03:24PM-loc-sen-attn-e6112f7'\n",
"MODEL_PATH_TMP = ROOT_PATH + '/checkpoint_{}.pth.tar'\n",
"CONFIG_PATH = ROOT_PATH + '/config.json'\n",
"OUT_FOLDER = ROOT_PATH + '/test/'\n",
"CONFIG = load_config(CONFIG_PATH)\n",
"use_cuda = True"
]
},
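{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: load_config returns an attribute-style config,\n",
"# so the fields used in the cells below should all be present in config.json.\n",
"print(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.num_freq, CONFIG.r)"
]
},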
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# check_idxs = [50008, 100016, 200032, 266208]\n",
"check_idxs = [274480]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load the model\n",
"model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels, CONFIG.r)\n",
"\n",
"# load the audio processor\n",
"\n",
"ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.min_level_db,\n",
" CONFIG.frame_shift_ms, CONFIG.frame_length_ms, CONFIG.preemphasis,\n",
" CONFIG.ref_level_db, CONFIG.num_freq, CONFIG.power, griffin_lim_iters=30) \n",
"\n",
"\n",
"for idx in check_idxs:\n",
" MODEL_PATH = MODEL_PATH_TMP.format(idx)\n",
" print(MODEL_PATH)\n",
" \n",
" # load model state\n",
" if use_cuda:\n",
" cp = torch.load(MODEL_PATH)\n",
" else:\n",
" cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)\n",
"\n",
" # load the model\n",
" model.load_state_dict(cp['model'])\n",
" if use_cuda:\n",
" model.cuda()\n",
" model.eval()\n",
"\n",
" model.decoder.max_decoder_steps = 400\n",
" text = \"Voice is natural, voice is human. Thats why we are fascinated with creating usable voice technology for our machines. But to create voice systems, an extremely large amount of voice data is required. Most of the data used by large companies isnt available to the majority of people. We think that stifles innovation. So weve launched Project Common Voice, a project to help make voice recognition open to everyone.\"\n",
" wavs = text2audio(text, model, CONFIG, use_cuda, ap)\n",
"\n",
" audio = np.concatenate(wavs)\n",
" IPython.display.display(Audio(audio, rate=CONFIG.sample_rate)) \n",
" ap.save_wav(audio, 'benchmark_samples/CommonVoice.wav')"
]
}
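,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional extra (not part of the original benchmark): `tts()` above keeps only the waveform, so the sketch below runs `create_speech` on a single sentence and plots the attention alignment and the predicted spectrogram. It assumes `create_speech` returns `(waveform, alignment, spectrogram, stop_tokens)` as used in `tts()` and that the cells above have been run; transpose the arrays if the shapes differ."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch (assumption): create_speech returns (waveform, alignment, spectrogram,\n",
"# stop_tokens) as in tts() above; model, CONFIG and ap come from earlier cells.\n",
"sentence = \"Voice is natural, voice is human.\"\n",
"waveform, alignment, spectrogram, stop_tokens = create_speech(model, sentence, CONFIG, use_cuda, ap)\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"plt.subplot(1, 2, 1)\n",
"plt.imshow(alignment.T, aspect='auto', origin='lower', interpolation='none')\n",
"plt.title('Attention alignment')\n",
"plt.xlabel('Decoder step')\n",
"plt.ylabel('Encoder step')\n",
"\n",
"plt.subplot(1, 2, 2)\n",
"plt.imshow(spectrogram.T, aspect='auto', origin='lower')\n",
"plt.title('Predicted linear spectrogram')\n",
"plt.xlabel('Frame')\n",
"plt.ylabel('Frequency bin')\n",
"\n",
"IPython.display.display(Audio(waveform, rate=CONFIG.sample_rate))"
]
}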
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}