TTS/notebooks/ReadArticle.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import os\n",
"import sys\n",
"import io\n",
"import torch \n",
"import time\n",
"import numpy as np\n",
"from collections import OrderedDict\n",
"from matplotlib import pylab as plt\n",
"\n",
"%pylab inline\n",
"rcParams[\"figure.figsize\"] = (16,5)\n",
"sys.path.append('/home/erogol/projects/')\n",
"\n",
"import librosa\n",
"import librosa.display\n",
"\n",
"from TTS.models.tacotron import Tacotron \n",
"from TTS.layers import *\n",
"from TTS.utils.data import *\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.generic_utils import load_config\n",
"from TTS.utils.text import text_to_sequence\n",
"\n",
"import IPython\n",
"from IPython.display import Audio\n",
"from utils import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ls /data/shared/erogol_models/May-22-2018_03:24PM-loc-sen-attn-e6112f7"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def tts(model, text, CONFIG, use_cuda, ap, figures=True):\n",
" waveform, alignment, spectrogram, stop_tokens = create_speech(model, text, CONFIG, use_cuda, ap) \n",
" return waveform\n",
"\n",
"def text2audio(text, model, CONFIG, use_cuda, ap):\n",
" wavs = []\n",
" for sen in text.split('.'):\n",
" if len(sen) < 3:\n",
" continue\n",
" sen+='.'\n",
" sen = sen.strip()\n",
" print(sen)\n",
" wav = tts(model, sen, CONFIG, use_cuda, ap)\n",
" wavs.append(wav)\n",
" wavs.append(np.zeros(10000))\n",
"# audio = np.stack(wavs)\n",
"# IPython.display.display(Audio(audio, rate=CONFIG.sample_rate)) \n",
" return wavs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set constants\n",
"ROOT_PATH = '/data/shared/erogol_models/May-22-2018_03:24PM-loc-sen-attn-e6112f7'\n",
"MODEL_PATH_TMP = ROOT_PATH + '/checkpoint_{}.pth.tar'\n",
"CONFIG_PATH = ROOT_PATH + '/config.json'\n",
"OUT_FOLDER = ROOT_PATH + '/test/'\n",
"CONFIG = load_config(CONFIG_PATH)\n",
"use_cuda = True"
]
},
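{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: load_config returns an attribute-style config,\n",
"# so the fields used in the cells below should all be present in config.json.\n",
"print(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.num_freq, CONFIG.r)"
]
},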
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# check_idxs = [50008, 100016, 200032, 266208]\n",
"check_idxs = [274480]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load the model\n",
"model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels, CONFIG.r)\n",
"\n",
"# load the audio processor\n",
"\n",
"ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.min_level_db,\n",
" CONFIG.frame_shift_ms, CONFIG.frame_length_ms, CONFIG.preemphasis,\n",
" CONFIG.ref_level_db, CONFIG.num_freq, CONFIG.power, griffin_lim_iters=30) \n",
"\n",
"\n",
"for idx in check_idxs:\n",
" MODEL_PATH = MODEL_PATH_TMP.format(idx)\n",
" print(MODEL_PATH)\n",
" \n",
" # load model state\n",
" if use_cuda:\n",
" cp = torch.load(MODEL_PATH)\n",
" else:\n",
" cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)\n",
"\n",
" # load the model\n",
" model.load_state_dict(cp['model'])\n",
" if use_cuda:\n",
" model.cuda()\n",
" model.eval()\n",
"\n",
" model.decoder.max_decoder_steps = 400\n",
" text = \"Voice is natural, voice is human. Thats why we are fascinated with creating usable voice technology for our machines. But to create voice systems, an extremely large amount of voice data is required. Most of the data used by large companies isnt available to the majority of people. We think that stifles innovation. So weve launched Project Common Voice, a project to help make voice recognition open to everyone.\"\n",
" wavs = text2audio(text, model, CONFIG, use_cuda, ap)\n",
"\n",
" audio = np.concatenate(wavs)\n",
" IPython.display.display(Audio(audio, rate=CONFIG.sample_rate)) \n",
" ap.save_wav(audio, 'benchmark_samples/CommonVoice.wav')"
]
}
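,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional extra (not part of the original benchmark): `tts()` above keeps only the waveform, so the sketch below runs `create_speech` on a single sentence and plots the attention alignment and the predicted spectrogram. It assumes `create_speech` returns `(waveform, alignment, spectrogram, stop_tokens)` as used in `tts()` and that the cells above have been run; transpose the arrays if the shapes differ."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch (assumption): create_speech returns (waveform, alignment, spectrogram,\n",
"# stop_tokens) as in tts() above; model, CONFIG and ap come from earlier cells.\n",
"sentence = \"Voice is natural, voice is human.\"\n",
"waveform, alignment, spectrogram, stop_tokens = create_speech(model, sentence, CONFIG, use_cuda, ap)\n",
"\n",
"plt.figure(figsize=(16, 5))\n",
"plt.subplot(1, 2, 1)\n",
"plt.imshow(alignment.T, aspect='auto', origin='lower', interpolation='none')\n",
"plt.title('Attention alignment')\n",
"plt.xlabel('Decoder step')\n",
"plt.ylabel('Encoder step')\n",
"\n",
"plt.subplot(1, 2, 2)\n",
"plt.imshow(spectrogram.T, aspect='auto', origin='lower')\n",
"plt.title('Predicted linear spectrogram')\n",
"plt.xlabel('Frame')\n",
"plt.ylabel('Frequency bin')\n",
"\n",
"IPython.display.display(Audio(waveform, rate=CONFIG.sample_rate))"
]
}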
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}