mirror of https://github.com/coqui-ai/TTS.git
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cd /home/erogol/projects/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.visual import plot_spectrogram\n",
"from TTS.utils.generic_utils import load_config\n",
"import glob\n",
"import IPython.display as ipd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"config_path = \"/media/erogol/data_ssd/Data/models/tr/TTS-phoneme-January-14-2019_06+52PM-4ad64a7/config.json\"\n",
"data_path = \"/home/erogol/Data/Mozilla/\"\n",
"file_paths = glob.glob(data_path + \"/**/*.wav\", recursive=True)\n",
"CONFIG = load_config(config_path)"
]
},
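{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check that the glob found audio files. The cells below index `file_paths[10]`, so the dataset needs at least 11 files."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: the examples below use file_paths[10].\n",
"print(len(file_paths), \"wav files found\")\n",
"assert len(file_paths) > 10, \"need at least 11 wav files under data_path\""
]
},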
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setup Audio Processor\n",
"Play with the AP parameters until you find a good fit with the synthesized speech below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"audio={\n",
"    'audio_processor': 'audio',\n",
"    'num_mels': 80,          # In general, you don't need to change it.\n",
"    'num_freq': 1025,        # In general, you don't need to change it.\n",
"    'sample_rate': 22050,    # It depends on the sample rate of the dataset.\n",
"    'frame_length_ms': 50,   # In general, you don't need to change it.\n",
"    'frame_shift_ms': 12.5,  # In general, you don't need to change it.\n",
"    'preemphasis': 0.98,     # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
"    'min_level_db': -100,\n",
"    'ref_level_db': 20,      # The reference level (dB): raise it until the background noise disappears from the spectrogram, then lower it until the synthesized speech below sounds better.\n",
"    'power': 1.5,            # Change this value and listen to the synthesized voice. 1.2 - 1.5 are reasonable values.\n",
"    'griffin_lim_iters': 60, # Values > 60 give no further improvement.\n",
"    'signal_norm': True,     # This concerns your model rather than synthesis quality.\n",
"    'symmetric_norm': False, # Same as above.\n",
"    'max_norm': 1,           # Same as above.\n",
"    'clip_norm': True,       # Same as above.\n",
"    'mel_fmin': 0.0,         # Play with this and check the mel-spectrogram based synthesis below.\n",
"    'mel_fmax': 8000.0,      # Play with this and check the mel-spectrogram based synthesis below.\n",
"    'do_trim_silence': True} # If your dataset has silence at the beginning or end, this trims it. Check AP.load_wav() below to see if it changes the loaded audio.\n",
"\n",
"AP = AudioProcessor(**audio)"
]
},
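{
"cell_type": "markdown",
"metadata": {},
"source": [
"The sketch below shows how these parameters typically map onto STFT settings and dB normalization in a Tacotron-style pipeline. It illustrates the usual formulas, not necessarily the exact internals of `AudioProcessor`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# Typical mapping from the config above to STFT parameters (assumed, for illustration):\n",
"n_fft = (audio['num_freq'] - 1) * 2                                       # 2048\n",
"hop_length = int(audio['frame_shift_ms'] / 1000 * audio['sample_rate'])   # ~275 samples\n",
"win_length = int(audio['frame_length_ms'] / 1000 * audio['sample_rate'])  # ~1102 samples\n",
"print(n_fft, hop_length, win_length)\n",
"\n",
"# Typical dB normalization built on min_level_db / ref_level_db (assumed, for illustration):\n",
"def normalize_db(S_amp, min_level_db=-100, ref_level_db=20):\n",
"    S_db = 20 * np.log10(np.maximum(1e-5, S_amp)) - ref_level_db  # amplitude -> dB, shifted by the reference level\n",
"    return np.clip((S_db - min_level_db) / -min_level_db, 0, 1)   # map [min_level_db, 0] dB into [0, 1]"
]
},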
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Check audio loading"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wav = AP.load_wav(file_paths[10])\n",
"ipd.Audio(data=wav, rate=AP.sample_rate)"
]
},
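{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see whether `do_trim_silence` changes the loaded audio, compare against an untrimmed load. The raw librosa load below is an illustrative reference; it assumes `AP.load_wav` applies the trimming."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import librosa\n",
"\n",
"raw, _ = librosa.load(file_paths[10], sr=AP.sample_rate)  # untrimmed reference\n",
"print(\"untrimmed: {} samples, trimmed: {} samples\".format(len(raw), len(wav)))"
]
},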
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generate Mel-Spectrogram and Re-synthesize with GL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mel = AP.melspectrogram(wav)\n",
"print(\"Max:\", mel.max())\n",
"print(\"Min:\", mel.min())\n",
"print(\"Mean:\", mel.mean())\n",
"plot_spectrogram(mel.T, AP);\n",
"\n",
"wav_gen = AP.inv_mel_spectrogram(mel)\n",
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
]
},
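{
"cell_type": "markdown",
"metadata": {},
"source": [
"For comparison, the same mel round trip can be done with plain librosa (>= 0.7), independent of AP's normalization; this helps isolate whether artifacts come from Griffin-Lim itself or from the normalization settings. The STFT parameters below are the assumed mapping from the config above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import librosa\n",
"\n",
"# Raw (unnormalized) mel round trip with librosa's Griffin-Lim, for comparison:\n",
"raw_mel = librosa.feature.melspectrogram(y=wav, sr=AP.sample_rate, n_fft=2048,\n",
"                                         hop_length=275, win_length=1102, n_mels=80)\n",
"wav_lr = librosa.feature.inverse.mel_to_audio(raw_mel, sr=AP.sample_rate, n_fft=2048,\n",
"                                              hop_length=275, win_length=1102)\n",
"ipd.Audio(wav_lr, rate=AP.sample_rate)"
]
},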
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generate Linear-Spectrogram and Re-synthesize with GL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"spec = AP.spectrogram(wav)\n",
"print(\"Max:\", spec.max())\n",
"print(\"Min:\", spec.min())\n",
"print(\"Mean:\", spec.mean())\n",
"plot_spectrogram(spec.T, AP);\n",
"\n",
"wav_gen = AP.inv_spectrogram(spec)\n",
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
]
},
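{
"cell_type": "markdown",
"metadata": {},
"source": [
"What `griffin_lim_iters` controls: Griffin-Lim alternates between the known magnitude and a phase estimate recovered through ISTFT/STFT round trips. Below is a minimal sketch of the standard iteration, not AP's exact code; `mag` is assumed to be a linear-amplitude spectrogram, i.e. after denormalization and dB-to-amplitude conversion."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import librosa\n",
"\n",
"def griffin_lim(mag, n_iter=60, n_fft=2048, hop_length=275, win_length=1102):\n",
"    # Start from random phase, then alternate: ISTFT -> STFT -> keep the known magnitude.\n",
"    angles = np.exp(2j * np.pi * np.random.rand(*mag.shape))\n",
"    for _ in range(n_iter):\n",
"        wav = librosa.istft(mag * angles, hop_length=hop_length, win_length=win_length)\n",
"        stft = librosa.stft(wav, n_fft=n_fft, hop_length=hop_length, win_length=win_length)\n",
"        angles = np.exp(1j * np.angle(stft))  # keep only the phase estimate\n",
"    return librosa.istft(mag * angles, hop_length=hop_length, win_length=win_length)"
]
},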
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Compare values for a certain parameter\n",
"\n",
"Optimize your parameters by comparing different values, one parameter at a time."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"audio={\n",
"    'audio_processor': 'audio',\n",
"    'num_mels': 80,          # In general, you don't need to change it.\n",
"    'num_freq': 1025,        # In general, you don't need to change it.\n",
"    'sample_rate': 22050,    # It depends on the sample rate of the dataset.\n",
"    'frame_length_ms': 50,   # In general, you don't need to change it.\n",
"    'frame_shift_ms': 12.5,  # In general, you don't need to change it.\n",
"    'preemphasis': 0.98,     # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
"    'min_level_db': -100,\n",
"    'ref_level_db': 20,      # The reference level (dB): raise it until the background noise disappears from the spectrogram, then lower it until the synthesized speech below sounds better.\n",
"    'power': 1.5,            # Change this value and listen to the synthesized voice. 1.2 - 1.5 are reasonable values.\n",
"    'griffin_lim_iters': 60, # Values > 60 give no further improvement.\n",
"    'signal_norm': True,     # This concerns your model rather than synthesis quality.\n",
"    'symmetric_norm': False, # Same as above.\n",
"    'max_norm': 1,           # Same as above.\n",
"    'clip_norm': True,       # Same as above.\n",
"    'mel_fmin': 0.0,         # Play with this and check the mel-spectrogram based synthesis below.\n",
"    'mel_fmax': 8000.0,      # Play with this and check the mel-spectrogram based synthesis below.\n",
"    'do_trim_silence': True} # If your dataset has silence at the beginning or end, this trims it. Check AP.load_wav() below to see if it changes the loaded audio.\n",
"\n",
"AP = AudioProcessor(**audio)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from librosa import display\n",
"from matplotlib import pylab as plt\n",
"import IPython\n",
"plt.rcParams['figure.figsize'] = (20.0, 16.0)\n",
"\n",
"def compare_values(attribute, values, file):\n",
"    \"\"\"\n",
"    attribute (str): name of the AudioProcessor attribute to test.\n",
"    values (list): list of values to compare.\n",
"    file (str): path of the audio file used for the tests.\n",
"    \"\"\"\n",
"    wavs = []\n",
"    for idx, val in enumerate(values):\n",
"        setattr(AP, attribute, val)  # override the AP attribute under test\n",
"        wav = AP.load_wav(file)\n",
"        spec = AP.spectrogram(wav)\n",
"        spec_norm = AP._denormalize(spec.T)\n",
"        plt.subplot(len(values), 2, 2*idx + 1)\n",
"        plt.imshow(spec_norm.T, aspect=\"auto\", origin=\"lower\")\n",
"        # plt.colorbar()\n",
"        plt.tight_layout()\n",
"        wav_gen = AP.inv_spectrogram(spec)\n",
"        wavs.append(wav_gen)\n",
"        plt.subplot(len(values), 2, 2*idx + 2)\n",
"        display.waveplot(wav, alpha=0.5)\n",
"        display.waveplot(wav_gen, alpha=0.25)\n",
"        plt.title(\"{}={}\".format(attribute, val))\n",
"        plt.tight_layout()\n",
"\n",
"    wav = AP.load_wav(file)\n",
"    print(\" > Ground-truth\")\n",
"    IPython.display.display(IPython.display.Audio(wav, rate=AP.sample_rate))\n",
"\n",
"    for idx, wav_gen in enumerate(wavs):\n",
"        val = values[idx]\n",
"        print(\" > {} = {}\".format(attribute, val))\n",
"        IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99], file_paths[10])"
]
},
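{
"cell_type": "markdown",
"metadata": {},
"source": [
"Preemphasis is usually a first-order high-pass filter that boosts high frequencies before analysis and is inverted after synthesis. A minimal sketch of the standard formulation y[n] = x[n] - k * x[n-1]:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from scipy import signal\n",
"\n",
"k = 0.98  # the 'preemphasis' coefficient from the config above\n",
"emphasized = signal.lfilter([1, -k], [1], wav)       # y[n] = x[n] - k * x[n-1]\n",
"restored = signal.lfilter([1], [1, -k], emphasized)  # inverse (de-emphasis) filter\n",
"ipd.Audio(emphasized, rate=AP.sample_rate)"
]
},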
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"compare_values(\"ref_level_db\", [10, 15, 20, 25, 30, 35, 40], file_paths[10])"
]
},
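{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same pattern works for any other attribute; for example, a sweep over the Griffin-Lim `power` exponent, using the range suggested in the comments above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"compare_values(\"power\", [1.2, 1.35, 1.5], file_paths[10])"
]
}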
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}