From 49484f093786aea98319715b598ed80eb6bb89e5 Mon Sep 17 00:00:00 2001 From: Guy Elsmore-Paddock Date: Sat, 20 Mar 2021 21:30:48 -0400 Subject: [PATCH] Clean-up `CheckSpectrograms` Notebook - Fixes how parameters from the config are loaded while still making it possible for them to be overridden on the fly. - Prints the list of audio files by index. - Defines a constant to control which audio files gets loaded for all sections. - Eliminates having to load the audio processor twice. - Removes output (since it isn't relevant). --- .../dataset_analysis/CheckSpectrograms.ipynb | 166 +++++++++--------- 1 file changed, 79 insertions(+), 87 deletions(-) diff --git a/notebooks/dataset_analysis/CheckSpectrograms.ipynb b/notebooks/dataset_analysis/CheckSpectrograms.ipynb index e16d885f..1ca580e1 100644 --- a/notebooks/dataset_analysis/CheckSpectrograms.ipynb +++ b/notebooks/dataset_analysis/CheckSpectrograms.ipynb @@ -2,11 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ "%matplotlib inline\n", "\n", @@ -16,21 +14,32 @@ "\n", "import IPython.display as ipd\n", "import glob" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ "config_path = \"/home/erogol/Projects/TTS/tts/tts/config_thorsten_de.json\"\n", "data_path = \"/home/erogol/Data/thorsten-german/\"\n", + "\n", "file_paths = glob.glob(data_path + \"/**/*.wav\", recursive=True)\n", - "CONFIG = load_config(config_path)" - ] + "CONFIG = load_config(config_path)\n", + "\n", + "# Change this to the index of the desired file listed below\n", + "sample_file_index = 10\n", + "\n", + "SAMPLE_FILE_PATH = file_paths[sample_file_index]\n", + "\n", + "print(\"File list, by index:\")\n", + "dict(enumerate(file_paths))" + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -39,39 +48,54 @@ }, "source": [ 
"### Setup Audio Processor\n", - "Play with the AP parameters until you find a good fit with the synthesis speech below. " + "Play with the AP parameters until you find a good fit with the synthesis speech below.\n", + "\n", + "The default values are loaded from your config.json file, so you only need to\n", + "uncomment and modify values below that you'd like to tune." ] }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ - "# audio={\n", + "tune_params={\n", "# 'audio_processor': 'audio',\n", - "# 'num_mels': 80, # In general, you don'tneed to change it \n", - "# 'fft_size': 1024, # In general, you don'tneed to change it \n", - "# 'sample_rate': 22050, # It depends to the sample rate of the dataset.\n", - "# 'hop_length': 256, # In general, you don'tneed to change it \n", - "# 'win_length': 1024, # In general, you don'tneed to change it \n", - "# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes traning harder. If your model does not train, try 0.97 - 0.99.\n", + "# 'num_mels': 80, # In general, you don't need to change this. \n", + "# 'fft_size': 1024, # In general, you don't need to change this.\n", + "# 'sample_rate': 22050, # This must match the sample rate of the dataset.\n", + "# 'hop_length': 256, # In general, you don't need to change this.\n", + "# 'win_length': 1024, # In general, you don't need to change this.\n", + "# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n", "# 'min_level_db': -100,\n", - "# 'ref_level_db': 20, # It is the base DB, higher until you remove the background noise in the spectrogram and then lower until you hear a better speech below.\n", - "# 'power': 1.5, # Change this value and listen the synthesized voice. 
1.2 - 1.5 are some resonable values.\n", - "# 'griffin_lim_iters': 60, # It does not give any imporvement for values > 60\n", - "# 'signal_norm': True, # This is more about your model. It does not give any change for the synthsis performance.\n", - "# 'symmetric_norm': False, # Same as above\n", - "# 'max_norm': 1, # Same as above\n", - "# 'clip_norm': True, # Same as above\n", - "# 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", - "# 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", - "# 'do_trim_silence': True} # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n", + "# 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n", + "# 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are reasonable values.\n", + "# 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n", + "# 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", + "# 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", + "# 'do_trim_silence': True # If your dataset has some silence at the beginning or end, this trims it. 
Check the AP.load_wav() below, if it causes any difference for the loaded audio file.\n", + "}\n", "\n", - "AP = AudioProcessor(**CONFIG.audio);" - ] + "# These options have to be forced off in order to avoid errors about the \n", + "# pre-calculated stats not matching the options being tuned.\n", + "reset={\n", + "    'signal_norm': False,\n", + "    'stats_path': None,\n", + "    'symmetric_norm': False,\n", + "    'max_norm': 1,\n", + "    'clip_norm': True,\n", + "}\n", + "\n", + "# Override select parts of loaded config with parameters above\n", + "tuned_config = CONFIG.audio.copy()\n", + "tuned_config.update(reset)\n", + "tuned_config.update(tune_params)\n", + "\n", + "AP = AudioProcessor(**tuned_config);" + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -84,15 +108,15 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ - "wav = AP.load_wav(file_paths[10])\n", + "wav = AP.load_wav(SAMPLE_FILE_PATH)\n", "ipd.Audio(data=wav, rate=AP.sample_rate) " - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -105,20 +129,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "AP.power = 1.0" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ "mel = AP.melspectrogram(wav)\n", "print(\"Max:\", mel.max())\n", @@ -128,7 +149,9 @@ "\n", "wav_gen = AP.inv_melspectrogram(mel)\n", "ipd.Audio(wav_gen, rate=AP.sample_rate)" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -141,11 +164,9 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ "spec = AP.spectrogram(wav)\n", "print(\"Max:\", spec.max())\n", @@ -155,7 +176,9 @@ "\n", "wav_gen = AP.inv_spectrogram(spec)\n", "ipd.Audio(wav_gen, 
rate=AP.sample_rate)" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -170,54 +193,21 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], - "source": [ - "audio={\n", - " 'audio_processor': 'audio',\n", - " 'num_mels': 80, # In general, you don'tneed to change it \n", - " 'num_freq': 1025, # In general, you don'tneed to change it \n", - " 'sample_rate': 22050, # It depends to the sample rate of the dataset.\n", - " 'frame_length_ms': 50, # In general, you don'tneed to change it \n", - " 'frame_shift_ms': 12.5, # In general, you don'tneed to change it \n", - " 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes traning harder. If your model does not train, try 0.97 - 0.99.\n", - " 'min_level_db': -100,\n", - " 'ref_level_db': 20, # It is the base DB, higher until you remove the background noise in the spectrogram and then lower until you hear a better speech below.\n", - " 'power': 1.5, # Change this value and listen the synthesized voice. 1.2 - 1.5 are some resonable values.\n", - " 'griffin_lim_iters': 60, # It does not give any imporvement for values > 60\n", - " 'signal_norm': True, # This is more about your model. It does not give any change for the synthsis performance.\n", - " 'symmetric_norm': False, # Same as above\n", - " 'max_norm': 1, # Same as above\n", - " 'clip_norm': True, # Same as above\n", - " 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", - " 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", - " 'do_trim_silence': True} # If you dataset has some silience at the beginning or end, this trims it. 
Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n", - "\n", - "AP = AudioProcessor(**audio);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], "source": [ "from librosa import display\n", "from matplotlib import pylab as plt\n", "import IPython\n", "plt.rcParams['figure.figsize'] = (20.0, 16.0)\n", "\n", - "def compare_values(attribute, values, file):\n", + "def compare_values(attribute, values):\n", " \"\"\"\n", " attributes (str): the names of the attribute you like to test.\n", " values (list): list of values to compare.\n", - " file (str): file name to perform the tests.\n", " \"\"\"\n", + " file = SAMPLE_FILE_PATH\n", " wavs = []\n", " for idx, val in enumerate(values):\n", " set_val_cmd = \"AP.{}={}\".format(attribute, val)\n", @@ -245,29 +235,31 @@ " val = values[idx]\n", " print(\" > {} = {}\".format(attribute, val))\n", " IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ - "compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99], file_paths[10])" - ] + "compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99])" + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ - "compare_values(\"ref_level_db\", [10, 15, 20, 25, 30, 35, 40], file_paths[10])" - ] + "compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 100])" + ], + "execution_count": null, + "outputs": [] } ], "metadata": {