mirror of https://github.com/coqui-ai/TTS.git
222 lines
5.9 KiB
Plaintext
222 lines
5.9 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"This notebook computes the average SNR of a given voice dataset. If the SNR is too low, it might reduce performance or prevent the model from learning.\n",
|
|
"\n",
|
|
"To use this notebook, you need:\n",
|
|
"- WADA SNR estimation: http://www.cs.cmu.edu/~robust/archive/algorithms/WADA_SNR_IS_2008/\n",
|
|
" 1. extract in the same folder as this notebook\n",
|
|
" 2. under MacOS you'll have to rebuild the executable. In the build folder: 1) remove existing .o files and 2) run make\n",
|
|
"\n",
|
|
"\n",
|
|
"- FFMPEG: ```sudo apt-get install ffmpeg ``` \n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os, sys\n",
|
|
"import glob\n",
|
|
"import subprocess\n",
|
|
"import tempfile\n",
|
|
"import IPython\n",
|
|
"import soundfile as sf\n",
|
|
"import numpy as np\n",
|
|
"from tqdm import tqdm\n",
|
|
"from multiprocessing import Pool\n",
|
|
"from matplotlib import pylab as plt\n",
|
|
"%matplotlib inline"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Set the meta parameters\n",
|
|
"DATA_PATH = \"/home/erogol/Data/m-ai-labs/de_DE/by_book/female/eva_k/\"\n",
|
|
"NUM_PROC = 1\n",
|
|
"CURRENT_PATH = os.getcwd()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def compute_file_snr(file_path):\n",
"    \"\"\"Estimate the SNR of a wav file with the WADA SNR executable.\n",
"\n",
"    A temporary copy of the file is produced first: converted with FFMPEG to\n",
"    16 kHz, mono, 16-bit PCM when the sample rate is not 16000, copied as-is\n",
"    otherwise. The copy lives in a scratch directory that is deleted on every\n",
"    exit path, so nothing is left behind next to the dataset files.\n",
"\n",
"    Args:\n",
"        file_path (str): path of the wav file to analyse.\n",
"\n",
"    Returns:\n",
"        tuple: (snr, file_path) with snr parsed as a float from the WADASNR output.\n",
"\n",
"    Raises:\n",
"        subprocess.CalledProcessError: if ffmpeg, cp or WADASNR exits with an error.\n",
"        RuntimeError: if the WADASNR output cannot be parsed; the message holds the command.\n",
"    \"\"\"\n",
"    # Only the sample rate is needed, so read the header instead of decoding the audio.\n",
"    sr = sf.info(file_path).samplerate\n",
"    with tempfile.TemporaryDirectory() as tmp_dir:\n",
"        new_file = os.path.join(tmp_dir, 'tmp.wav')\n",
"        if sr != 16000:\n",
"            command = ['ffmpeg', '-i', file_path, '-ac', '1', '-acodec', 'pcm_s16le', '-y', '-ar', '16000', new_file]\n",
"        else:\n",
"            command = ['cp', file_path, new_file]\n",
"        # Argument lists (no shell) pass paths through verbatim, whatever characters they contain.\n",
"        subprocess.run(command, check=True)\n",
"        command = [\n",
"            f'{CURRENT_PATH}/WadaSNR/Exe/WADASNR',\n",
"            '-i', new_file,\n",
"            '-t', f'{CURRENT_PATH}/WadaSNR/Exe/Alpha0.400000.txt',\n",
"            '-ifmt', 'mswav',\n",
"        ]\n",
"        output = subprocess.check_output(command)\n",
"        try:\n",
"            # The value is the third token from the end of the WADASNR output.\n",
"            snr = float(output.split()[-3].decode('utf-8'))\n",
"        except (IndexError, ValueError) as err:\n",
"            raise RuntimeError(' '.join(command)) from err\n",
"    return snr, file_path\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"wav_file = \"/home/erogol/Data/LJSpeech-1.1/wavs/LJ001-0001.wav\"\n",
|
|
"output = compute_file_snr(wav_file)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"wav_files = glob.glob(f\"{DATA_PATH}/**/*.wav\", recursive=True)\n",
|
|
"print(f\" > Number of wav files {len(wav_files)}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Compute a (snr, file_path) tuple for every wav file, serially or with a process pool.\n",
"if NUM_PROC == 1:\n",
"    # Iterate over the list itself so tqdm knows the total and can show progress and ETA.\n",
"    file_snrs = [compute_file_snr(wav_file) for wav_file in tqdm(wav_files)]\n",
"else:\n",
"    with Pool(NUM_PROC) as pool:\n",
"        # imap keeps the results in the same order as wav_files.\n",
"        file_snrs = list(tqdm(pool.imap(compute_file_snr, wav_files), total=len(wav_files)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Separate the files whose SNR came back as NaN from the valid measurements.\n",
"snrs = [tup[0] for tup in file_snrs]\n",
"\n",
"error_idxs = np.where(np.isnan(snrs))[0]\n",
"# Take the paths from the result tuples: file_names is only defined further down.\n",
"error_files = [file_snrs[idx][1] for idx in error_idxs]\n",
"\n",
"# A set gives constant-time membership tests while filtering.\n",
"error_idx_set = set(error_idxs.tolist())\n",
"file_snrs = [tup for idx, tup in enumerate(file_snrs) if idx not in error_idx_set]\n",
"file_names = [tup[1] for tup in file_snrs]\n",
"snrs = [tup[0] for tup in file_snrs]\n",
"# Positions in snrs/file_names ordered from the lowest to the highest SNR.\n",
"file_idxs = np.argsort(snrs)\n",
"\n",
"\n",
"print(f\" > Average SNR of the dataset:{np.mean(snrs)}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def output_snr_with_audio(idx):\n",
"    \"\"\"Print the path and SNR of a ranked file and show an audio player for it.\n",
"\n",
"    Args:\n",
"        idx (int): position in file_idxs, the SNR-sorted ordering of the files.\n",
"    \"\"\"\n",
"    file_idx = file_idxs[idx]\n",
"    file_name = file_names[file_idx]\n",
"    audio, sample_rate = sf.read(file_name)\n",
"    # keep only the first channel of a two-dimensional (multi channel) signal\n",
"    if audio.ndim == 2:\n",
"        audio = audio[:, 0]\n",
"    print(f\" > {file_name} - snr:{snrs[file_idx]}\")\n",
"    player = IPython.display.Audio(audio, rate=sample_rate)\n",
"    IPython.display.display(player)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# find the files with the worst (lowest) SNR\n",
"N = 10  # number of files to fetch\n",
"# Clamp to the number of valid files so small datasets do not raise IndexError.\n",
"for i in range(min(N, len(file_idxs))):\n",
"    output_snr_with_audio(i)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# find the recordings with the best (highest) SNR\n",
"N = 10  # number of files to fetch\n",
"# Clamp to the number of valid files so small datasets do not raise IndexError.\n",
"for i in range(min(N, len(file_idxs))):\n",
"    # negative positions walk the SNR-sorted order from the top\n",
"    output_snr_with_audio(-i-1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Distribution of the per-file SNR estimates over the dataset.\n",
"plt.hist(snrs, bins=100)\n",
"plt.xlabel('SNR')\n",
"plt.ylabel('Number of files')\n",
"plt.title('SNR distribution of the dataset');"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3(mztts)",
|
|
"language": "python",
|
|
"name": "mztts"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.8"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|