Clean Up `CheckSpectrograms` Notebook

- Fixes how parameters from the config are loaded while still making it
  possible for them to be overridden on the fly (see the sketch after this
  list).
- Prints the list of audio files by index.
- Defines a constant to control which audio file gets loaded for all
  sections.
- Eliminates having to load the audio processor twice.
- Removes saved cell output (since it isn't relevant).
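
For context, the override pattern from the first bullet looks roughly like
the sketch below. It mirrors the new notebook cell in the diff; the import
paths and the "config.json" path are assumptions, not part of this change.

    from TTS.utils.audio import AudioProcessor  # assumed import path
    from TTS.utils.io import load_config        # assumed import path

    CONFIG = load_config("config.json")  # placeholder path

    # Forced off so pre-calculated stats can't conflict with tuned values
    reset = {
        'signal_norm': False,
        'stats_path': None,
        'symmetric_norm': False,
        'max_norm': 1,
        'clip_norm': True,
    }

    # Per-run overrides; anything left out keeps its config.json value
    tune_params = {'power': 1.5}

    tuned_config = CONFIG.audio.copy()  # start from the loaded defaults
    tuned_config.update(reset)          # apply the forced resets
    tuned_config.update(tune_params)    # then apply the user's overrides

    AP = AudioProcessor(**tuned_config)  # the processor is built only once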
pull/393/head
Guy Elsmore-Paddock 2021-03-20 21:30:48 -04:00
parent 0601f6fc0f
commit 49484f0937
1 changed file with 79 additions and 87 deletions


@@ -2,11 +2,9 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"\n",
@@ -16,21 +14,32 @@
"\n",
"import IPython.display as ipd\n",
"import glob"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"config_path = \"/home/erogol/Projects/TTS/tts/tts/config_thorsten_de.json\"\n",
"data_path = \"/home/erogol/Data/thorsten-german/\"\n",
"\n",
"file_paths = glob.glob(data_path + \"/**/*.wav\", recursive=True)\n",
"CONFIG = load_config(config_path)"
]
"CONFIG = load_config(config_path)\n",
"\n",
"# Change this to the index of the desired file listed below\n",
"sample_file_index = 10\n",
"\n",
"SAMPLE_FILE_PATH = file_paths[sample_file_index]\n",
"\n",
"print(\"File list, by index:\")\n",
"dict(enumerate(file_paths))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
@@ -39,39 +48,54 @@
},
"source": [
"### Setup Audio Processor\n",
"Play with the AP parameters until you find a good fit with the synthesis speech below. "
"Play with the AP parameters until you find a good fit with the synthesis speech below.\n",
"\n",
"The default values are loaded from your config.json file, so you only need to\n",
"uncomment and modify values below that you'd like to tune."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"# audio={\n",
"tune_params={\n",
"# 'audio_processor': 'audio',\n",
"# 'num_mels': 80, # In general, you don'tneed to change it \n",
"# 'fft_size': 1024, # In general, you don'tneed to change it \n",
"# 'sample_rate': 22050, # It depends to the sample rate of the dataset.\n",
"# 'hop_length': 256, # In general, you don'tneed to change it \n",
"# 'win_length': 1024, # In general, you don'tneed to change it \n",
"# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes traning harder. If your model does not train, try 0.97 - 0.99.\n",
"# 'num_mels': 80, # In general, you don't need to change this. \n",
"# 'fft_size': 1024, # In general, you don't need to change this.\n",
"# 'sample_rate': 22050, # This must match the sample rate of the dataset.\n",
"# 'hop_length': 256, # In general, you don't need to change this.\n",
"# 'win_length': 1024, # In general, you don't need to change this.\n",
"# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
"# 'min_level_db': -100,\n",
"# 'ref_level_db': 20, # It is the base DB, higher until you remove the background noise in the spectrogram and then lower until you hear a better speech below.\n",
"# 'power': 1.5, # Change this value and listen the synthesized voice. 1.2 - 1.5 are some resonable values.\n",
"# 'griffin_lim_iters': 60, # It does not give any imporvement for values > 60\n",
"# 'signal_norm': True, # This is more about your model. It does not give any change for the synthsis performance.\n",
"# 'symmetric_norm': False, # Same as above\n",
"# 'max_norm': 1, # Same as above\n",
"# 'clip_norm': True, # Same as above\n",
"# 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n",
"# 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n",
"# 'do_trim_silence': True} # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n",
"# 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n",
"# 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n",
"# 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n",
"# 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
"# 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
"# 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n",
"}\n",
"\n",
"AP = AudioProcessor(**CONFIG.audio);"
]
"# These options have to be forced off in order to avoid errors about the \n",
"# pre-calculated not matching the options being tuned.\n",
"reset={\n",
" 'signal_norm': False,\n",
" 'stats_path': None,\n",
" 'symmetric_norm': False,\n",
" 'max_norm': 1,\n",
" 'clip_norm': True,\n",
"}\n",
"\n",
"# Override select parts of loaded config with parameters above\n",
"tuned_config = CONFIG.audio.copy()\n",
"tuned_config.update(reset)\n",
"tuned_config.update(tune_params)\n",
"\n",
"AP = AudioProcessor(**tuned_config);"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
@@ -84,15 +108,15 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"wav = AP.load_wav(file_paths[10])\n",
"wav = AP.load_wav(SAMPLE_FILE_PATH)\n",
"ipd.Audio(data=wav, rate=AP.sample_rate) "
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
@@ -105,20 +129,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"AP.power = 1.0"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"mel = AP.melspectrogram(wav)\n",
"print(\"Max:\", mel.max())\n",
@@ -128,7 +149,9 @@
"\n",
"wav_gen = AP.inv_melspectrogram(mel)\n",
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
@@ -141,11 +164,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"spec = AP.spectrogram(wav)\n",
"print(\"Max:\", spec.max())\n",
@@ -155,7 +176,9 @@
"\n",
"wav_gen = AP.inv_spectrogram(spec)\n",
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
@@ -170,54 +193,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"audio={\n",
" 'audio_processor': 'audio',\n",
" 'num_mels': 80, # In general, you don'tneed to change it \n",
" 'num_freq': 1025, # In general, you don'tneed to change it \n",
" 'sample_rate': 22050, # It depends to the sample rate of the dataset.\n",
" 'frame_length_ms': 50, # In general, you don'tneed to change it \n",
" 'frame_shift_ms': 12.5, # In general, you don'tneed to change it \n",
" 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes traning harder. If your model does not train, try 0.97 - 0.99.\n",
" 'min_level_db': -100,\n",
" 'ref_level_db': 20, # It is the base DB, higher until you remove the background noise in the spectrogram and then lower until you hear a better speech below.\n",
" 'power': 1.5, # Change this value and listen the synthesized voice. 1.2 - 1.5 are some resonable values.\n",
" 'griffin_lim_iters': 60, # It does not give any imporvement for values > 60\n",
" 'signal_norm': True, # This is more about your model. It does not give any change for the synthsis performance.\n",
" 'symmetric_norm': False, # Same as above\n",
" 'max_norm': 1, # Same as above\n",
" 'clip_norm': True, # Same as above\n",
" 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n",
" 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n",
" 'do_trim_silence': True} # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n",
"\n",
"AP = AudioProcessor(**audio);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"from librosa import display\n",
"from matplotlib import pylab as plt\n",
"import IPython\n",
"plt.rcParams['figure.figsize'] = (20.0, 16.0)\n",
"\n",
"def compare_values(attribute, values, file):\n",
"def compare_values(attribute, values):\n",
" \"\"\"\n",
" attributes (str): the names of the attribute you like to test.\n",
" values (list): list of values to compare.\n",
" file (str): file name to perform the tests.\n",
" \"\"\"\n",
" file = SAMPLE_FILE_PATH\n",
" wavs = []\n",
" for idx, val in enumerate(values):\n",
" set_val_cmd = \"AP.{}={}\".format(attribute, val)\n",
@@ -245,29 +235,31 @@
" val = values[idx]\n",
" print(\" > {} = {}\".format(attribute, val))\n",
" IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))"
]
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99], file_paths[10])"
]
"compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99])"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"compare_values(\"ref_level_db\", [10, 15, 20, 25, 30, 35, 40], file_paths[10])"
]
"compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 100])"
],
"execution_count": null,
"outputs": []
}
],
"metadata": {