From 49484f093786aea98319715b598ed80eb6bb89e5 Mon Sep 17 00:00:00 2001 From: Guy Elsmore-Paddock Date: Sat, 20 Mar 2021 21:30:48 -0400 Subject: [PATCH] Clean-up `CheckSpectrograms` Notebook - Fixes how parameters from the config are loaded while still making it possible for them to be overridden on the fly. - Prints the list of audio files by index. - Defines a constant to control which audio files gets loaded for all sections. - Eliminates having to load the audio processor twice. - Removes output (since it isn't relevant). --- .../dataset_analysis/CheckSpectrograms.ipynb | 166 +++++++++--------- 1 file changed, 79 insertions(+), 87 deletions(-) diff --git a/notebooks/dataset_analysis/CheckSpectrograms.ipynb b/notebooks/dataset_analysis/CheckSpectrograms.ipynb index e16d885f..1ca580e1 100644 --- a/notebooks/dataset_analysis/CheckSpectrograms.ipynb +++ b/notebooks/dataset_analysis/CheckSpectrograms.ipynb @@ -2,11 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ "%matplotlib inline\n", "\n", @@ -16,21 +14,32 @@ "\n", "import IPython.display as ipd\n", "import glob" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ "config_path = \"/home/erogol/Projects/TTS/tts/tts/config_thorsten_de.json\"\n", "data_path = \"/home/erogol/Data/thorsten-german/\"\n", + "\n", "file_paths = glob.glob(data_path + \"/**/*.wav\", recursive=True)\n", - "CONFIG = load_config(config_path)" - ] + "CONFIG = load_config(config_path)\n", + "\n", + "# Change this to the index of the desired file listed below\n", + "sample_file_index = 10\n", + "\n", + "SAMPLE_FILE_PATH = file_paths[sample_file_index]\n", + "\n", + "print(\"File list, by index:\")\n", + "dict(enumerate(file_paths))" + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -39,39 +48,54 @@ }, "source": [ 
"### Setup Audio Processor\n", - "Play with the AP parameters until you find a good fit with the synthesis speech below. " + "Play with the AP parameters until you find a good fit with the synthesis speech below.\n", + "\n", + "The default values are loaded from your config.json file, so you only need to\n", + "uncomment and modify values below that you'd like to tune." ] }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ - "# audio={\n", + "tune_params={\n", "# 'audio_processor': 'audio',\n", - "# 'num_mels': 80, # In general, you don'tneed to change it \n", - "# 'fft_size': 1024, # In general, you don'tneed to change it \n", - "# 'sample_rate': 22050, # It depends to the sample rate of the dataset.\n", - "# 'hop_length': 256, # In general, you don'tneed to change it \n", - "# 'win_length': 1024, # In general, you don'tneed to change it \n", - "# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes traning harder. If your model does not train, try 0.97 - 0.99.\n", + "# 'num_mels': 80, # In general, you don't need to change this. \n", + "# 'fft_size': 1024, # In general, you don't need to change this.\n", + "# 'sample_rate': 22050, # This must match the sample rate of the dataset.\n", + "# 'hop_length': 256, # In general, you don't need to change this.\n", + "# 'win_length': 1024, # In general, you don't need to change this.\n", + "# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n", "# 'min_level_db': -100,\n", - "# 'ref_level_db': 20, # It is the base DB, higher until you remove the background noise in the spectrogram and then lower until you hear a better speech below.\n", - "# 'power': 1.5, # Change this value and listen the synthesized voice. 
1.2 - 1.5 are some resonable values.\n", - "# 'griffin_lim_iters': 60, # It does not give any imporvement for values > 60\n", - "# 'signal_norm': True, # This is more about your model. It does not give any change for the synthsis performance.\n", - "# 'symmetric_norm': False, # Same as above\n", - "# 'max_norm': 1, # Same as above\n", - "# 'clip_norm': True, # Same as above\n", - "# 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", - "# 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", - "# 'do_trim_silence': True} # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n", + "# 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n", + "# 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are reasonable values.\n", + "# 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n", + "# 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", + "# 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", + "# 'do_trim_silence': True # If your dataset has some silence at the beginning or end, this trims it. 
Check the AP.load_wav() below, if it causes any difference for the loaded audio file.\n", + "}\n", "\n", - "AP = AudioProcessor(**CONFIG.audio);" - ] + "# These options have to be forced off in order to avoid errors about the \n", + "# pre-calculated stats not matching the options being tuned.\n", + "reset={\n", + "    'signal_norm': False,\n", + "    'stats_path': None,\n", + "    'symmetric_norm': False,\n", + "    'max_norm': 1,\n", + "    'clip_norm': True,\n", + "}\n", + "\n", + "# Override select parts of loaded config with parameters above\n", + "tuned_config = CONFIG.audio.copy()\n", + "tuned_config.update(reset)\n", + "tuned_config.update(tune_params)\n", + "\n", + "AP = AudioProcessor(**tuned_config);" + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -84,15 +108,15 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ - "wav = AP.load_wav(file_paths[10])\n", + "wav = AP.load_wav(SAMPLE_FILE_PATH)\n", "ipd.Audio(data=wav, rate=AP.sample_rate) " - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -105,20 +129,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "AP.power = 1.0" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ "mel = AP.melspectrogram(wav)\n", "print(\"Max:\", mel.max())\n", @@ -128,7 +149,9 @@ "\n", "wav_gen = AP.inv_melspectrogram(mel)\n", "ipd.Audio(wav_gen, rate=AP.sample_rate)" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -141,11 +164,9 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ "spec = AP.spectrogram(wav)\n", "print(\"Max:\", spec.max())\n", @@ -155,7 +176,9 @@ "\n", "wav_gen = AP.inv_spectrogram(spec)\n", "ipd.Audio(wav_gen, 
rate=AP.sample_rate)" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -170,54 +193,21 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], - "source": [ - "audio={\n", - " 'audio_processor': 'audio',\n", - " 'num_mels': 80, # In general, you don'tneed to change it \n", - " 'num_freq': 1025, # In general, you don'tneed to change it \n", - " 'sample_rate': 22050, # It depends to the sample rate of the dataset.\n", - " 'frame_length_ms': 50, # In general, you don'tneed to change it \n", - " 'frame_shift_ms': 12.5, # In general, you don'tneed to change it \n", - " 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes traning harder. If your model does not train, try 0.97 - 0.99.\n", - " 'min_level_db': -100,\n", - " 'ref_level_db': 20, # It is the base DB, higher until you remove the background noise in the spectrogram and then lower until you hear a better speech below.\n", - " 'power': 1.5, # Change this value and listen the synthesized voice. 1.2 - 1.5 are some resonable values.\n", - " 'griffin_lim_iters': 60, # It does not give any imporvement for values > 60\n", - " 'signal_norm': True, # This is more about your model. It does not give any change for the synthsis performance.\n", - " 'symmetric_norm': False, # Same as above\n", - " 'max_norm': 1, # Same as above\n", - " 'clip_norm': True, # Same as above\n", - " 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", - " 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", - " 'do_trim_silence': True} # If you dataset has some silience at the beginning or end, this trims it. 
Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n", - "\n", - "AP = AudioProcessor(**audio);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], "source": [ "from librosa import display\n", "from matplotlib import pylab as plt\n", "import IPython\n", "plt.rcParams['figure.figsize'] = (20.0, 16.0)\n", "\n", - "def compare_values(attribute, values, file):\n", + "def compare_values(attribute, values):\n", " \"\"\"\n", " attributes (str): the names of the attribute you like to test.\n", " values (list): list of values to compare.\n", - " file (str): file name to perform the tests.\n", " \"\"\"\n", + " file = SAMPLE_FILE_PATH\n", " wavs = []\n", " for idx, val in enumerate(values):\n", " set_val_cmd = \"AP.{}={}\".format(attribute, val)\n", @@ -245,29 +235,31 @@ " val = values[idx]\n", " print(\" > {} = {}\".format(attribute, val))\n", " IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ - "compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99], file_paths[10])" - ] + "compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99])" + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "code", - "execution_count": null, "metadata": { "Collapsed": "false" }, - "outputs": [], "source": [ - "compare_values(\"ref_level_db\", [10, 15, 20, 25, 30, 35, 40], file_paths[10])" - ] + "compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 100])" + ], + "execution_count": null, + "outputs": [] } ], "metadata": {