mirror of https://github.com/coqui-ai/TTS.git
287 lines
294 KiB
Plaintext
287 lines
294 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Jupyter Notebook for phoneme coverage analysis\n",
|
|||
|
"\n",
|
|||
|
"This Jupyter notebook checks the dataset configured in config.json for phoneme coverage.\n",
|
|||
|
"As mentioned in the [TTS dataset wiki](https://github.com/mozilla/TTS/wiki/Dataset#what-makes-a-good-dataset), good phoneme coverage is recommended.\n",
|
|||
|
"\n",
|
|||
|
"Most parameters are taken from the config.json file in the Mozilla TTS repo, so please ensure it is configured correctly for your dataset.\n",
|
|||
|
"This notebook uses lots of existing code from the TTS repo to ensure future compatibility.\n",
|
|||
|
"\n",
|
|||
|
"Many thanks to Neil Stoker for supporting me on this topic :-).\n",
|
|||
|
"\n",
|
|||
|
"I provide this notebook without any warranty, but it's hopefully useful for your dataset analysis.\n",
|
|||
|
"\n",
|
|||
|
"Happy TTS'ing :-)\n",
|
|||
|
"\n",
|
|||
|
"Thorsten Müller\n",
|
|||
|
"\n",
|
|||
|
"* https://github.com/thorstenMueller/deep-learning-german-tts\n",
|
|||
|
"* https://discourse.mozilla.org/t/contributing-my-german-voice-for-tts/"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 1,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# set some vars\n",
|
|||
|
"TTS_PATH = \"/home/thorsten/___dev/tts/mozilla/TTS\"\n",
|
|||
|
"CONFIG_FILE = \"/home/thorsten/___dev/tts/mozilla/TTS/TTS/tts/configs/config.json\"\n",
|
|||
|
"CHARS_TO_REMOVE = \".,:!?'\""
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"/home/thorsten/___dev/tts/mozilla/TTS\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"cd $TTS_PATH"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 3,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# import stuff\n",
|
|||
|
"from TTS.utils.io import load_config\n",
|
|||
|
"from TTS.tts.datasets.preprocess import load_meta_data\n",
|
|||
|
"from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme\n",
|
|||
|
"from tqdm import tqdm\n",
|
|||
|
"from matplotlib import pylab as plt\n",
|
|||
|
"\n",
|
|||
|
"# extra imports that might not be included in requirements.txt\n",
|
|||
|
"import collections\n",
|
|||
|
"import operator\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 4,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Load config.json properties\n",
|
|||
|
"CONFIG = load_config(CONFIG_FILE)\n",
|
|||
|
"\n",
|
|||
|
"# Load some properties from config.json\n",
|
|||
|
"CONFIG_METADATA = load_meta_data(CONFIG.datasets)[0]\n",
|
|||
|
"CONFIG_DATASET = CONFIG.datasets[0]\n",
|
|||
|
"CONFIG_PHONEME_LANGUAGE = CONFIG.phoneme_language\n",
|
|||
|
"CONFIG_TEXT_CLEANER = CONFIG.text_cleaner\n",
|
|||
|
"CONFIG_ENABLE_EOS_BOS_CHARS = CONFIG.enable_eos_bos_chars\n",
|
|||
|
"\n",
|
|||
|
"# Will be printed on generated output graph\n",
|
|||
|
"CONFIG_RUN_NAME = CONFIG.run_name\n",
|
|||
|
"CONFIG_RUN_DESC = CONFIG.run_description"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" > Run name: thorsten-de (github.com/thorstenMueller/deep-learning-german-tts)\n",
|
|||
|
" > Dataset files: 101\n",
|
|||
|
" > Phoneme language: de\n",
|
|||
|
" > Used text cleaner: phoneme_cleaners\n",
|
|||
|
" > Enable eos bos chars: False\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# print some debug information on loaded config values\n",
|
|||
|
"print(\" > Run name: \" + CONFIG_RUN_NAME + \" (\" + CONFIG_RUN_DESC + \")\")\n",
|
|||
|
"print(\" > Dataset files: \" + str(len(CONFIG_METADATA)))\n",
|
|||
|
"print(\" > Phoneme language: \" + CONFIG_PHONEME_LANGUAGE)\n",
|
|||
|
"print(\" > Used text cleaner: \" + CONFIG_TEXT_CLEANER)\n",
|
|||
|
"print(\" > Enable eos bos chars: \" + str(CONFIG_ENABLE_EOS_BOS_CHARS))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" 77%|███████▋ | 78/101 [00:06<00:01, 13.03it/s][WARNING] 1 utterances containing language switches on lines 1\n",
|
|||
|
"[WARNING] extra phones may appear in the \"de\" phoneset\n",
|
|||
|
"[WARNING] language switch flags have been kept (applying \"keep-flags\" policy)\n",
|
|||
|
"100%|██████████| 101/101 [00:07<00:00, 12.87it/s]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Get phonemes from metadata\n",
|
|||
|
"phonemes = []\n",
|
|||
|
"\n",
|
|||
|
"for phrase in tqdm(CONFIG_METADATA):\n",
|
|||
|
" if len(phrase[0]) > 0:\n",
|
|||
|
" tmpPhrase = phrase[0].rstrip('\\n')\n",
|
|||
|
" for removeChar in CHARS_TO_REMOVE:\n",
|
|||
|
" tmpPhrase = tmpPhrase.replace(removeChar,\"\")\n",
|
|||
|
" \n",
|
|||
|
" seq = phoneme_to_sequence(tmpPhrase, [CONFIG_TEXT_CLEANER], CONFIG_PHONEME_LANGUAGE, CONFIG_ENABLE_EOS_BOS_CHARS)\n",
|
|||
|
" text = sequence_to_phoneme(seq)\n",
|
|||
|
" text = text.replace(\" \",\"\")\n",
|
|||
|
" phonemes.append(text)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Dataset contains 39 different ipa phonemes.\n",
|
|||
|
"Dataset consists of 2481 phonemes\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"s = \"\"\n",
|
|||
|
"phonemeString = s.join(phonemes)\n",
|
|||
|
"\n",
|
|||
|
"d = {}\n",
|
|||
|
"collections._count_elements(d, phonemeString)\n",
|
|||
|
"sorted_d = dict(sorted(d.items(), key=operator.itemgetter(1),reverse=True))\n",
|
|||
|
"\n",
|
|||
|
"# remove useless keys\n",
|
|||
|
"sorted_d.pop(' ', None)\n",
|
|||
|
"sorted_d.pop('ˈ', None)\n",
|
|||
|
"\n",
|
|||
|
"phonemesSum = len(phonemeString.replace(\" \",\"\"))\n",
|
|||
|
"\n",
|
|||
|
"print(\"Dataset contains \" + str(len(sorted_d)) + \" different ipa phonemes.\")\n",
|
|||
|
"print(\"Dataset consists of \" + str(phonemesSum) + \" phonemes\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"5 rarest phonemes\n",
|
|||
|
"y --> 1 occurrences\n",
|
|||
|
"ø --> 2 occurrences\n",
|
|||
|
"( --> 2 occurrences\n",
|
|||
|
") --> 2 occurrences\n",
|
|||
|
"j --> 3 occurrences\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(\"5 rarest phonemes\")\n",
|
|||
|
"\n",
|
|||
|
"rareList = dict(sorted(sorted_d.items(), key=operator.itemgetter(1), reverse=False)[:5])\n",
|
|||
|
"for key, value in rareList.items():\n",
|
|||
|
" print(key + \" --> \" + str(value) + \" occurrences\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAC2AAAAtHCAYAAADZF5EIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nOydd7gdVbXAfysJAULvPQTpVXoRhQCKgDRB0Yciodh7BxSN2FFBfaCgiBGUJ1UERFTKBZHepdfQSyihhhSy3h9rLpm7zz7nzMyZc2bOvev3ffMlM/fsMrPb2muvvbaoKo7jOI7jOI7jOI7jOI7jOI7jOI7jOI7jOI7jOI7jOI7jOE57RlWdAcdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxnH7BDbAdx3Ecx3Ecx3Ecx3Ecx3Ecx3Ecx3Ecx3Ecx3Ecx3Ecx3Ey4gbYjuM4juM4juM4juM4juM4juM4juM4juM4juM4juM4juM4GXEDbMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxHMdxnIy4AbbjOI7jOI7jOI7jOI7jOI7jOI7jOI7jOI7jOI7jOI7jOE5G3ADbcRzHcRzHcRzHcRzHcRzHcRzHcRzHcRzHcRzHcRzHcRwnI26A7TiO4xRGRCaJiAbXhKrz5TiO4zj9jogsJiKfEZEzReQ+EXleRN4Ixtxbqs7nSEFEBoJvP1B1nhxnJCIiEyPzj4lV56sKROSjwXf4cdV5akcd+tKRXoci7z656jw5Ti8QkalB3Z9SdZ4cx3EGKTo++7heL+og6zq9RUSmBGU+teo8Ob3D+2DHcZzhgYgsJyIvp/rz+0Vk/qrz5ThOf+EG2I7jOI7jOI7jODVCRD4BPAr8L/A+YA1gCXz+5jiO4wAisiTww9Sj54AfVJQdx3Ecx3Ecx3Ecx3Ecx3GcvkNVnwaOST1aHfhqRdlxHKdP8QV8x3GcYYCITIjstm51zRCRp0TkHhE5X0Qmi8hOIuLjguM4juNUiIgcCfwaWKTqvDiO4zi15QfAUqn776nqi1VlxnEcJw8i8pVAR3Vu1XlynOFKk9MLB6+5IrJaSekc2UoXXUYajlMEEbk9qI97V50nx3Ecx3Ecp3b8FHgmdX+4iIyvKjOO4/QfbmjnOI4zMlkAWA5YC9gd+DZwMfCAiHxZRMZUmTnHcRzHGYmIyGbA5MifFHgEuA24NXXd07PM9RlNDA0mVJ0vp3f4UcDOcEVE1gUOTT16Gjihouz0BBGZGrTnKVXnyXGcjtgruG8wwE4cBbgB5whGRAaCOjBQdZ6GIQIc1HEkIqXE4zhlIyKrA+unHs0A/llRdhzHcRzHcWpH2eso/aqTV9WXgWNTj8YBR1WUHcdx+hA3wHYcx3HSTMB2+F0jImtWnBfHcRzHGWl8icY52g+BpVV1VVV9q6punLo+UEEeHcdxnGr5LjA6dX+sqr5eVWYcx3HyICLLAG9LPXoDuKCi7DiOAweWcCLiDkApnrQdp2TCDT//UtXXKsmJ4ziO4ziOU3d+BaRPGPxw4gjDcRynLW6A7TiOM3x5laFeMtPXfcALLcJuBlwsIit3O5OO4ziO40Cy6L1H8PhMVT1CVZ+vIk+O4zhOvRCRTYF9Uo9ewhYHHMdx+oU9GLomcZWqPltVZhzHYTywU4dxHFxGRhynC+wd3P+1klw4juM4juM4tUdVX2LoKYOjge9UlB3HcfqMMVVnwHEcx+kaN6jqxFY/SI7h+yDwBWDp4M/jgTOBbbqSO8dxHMdx0qwLLBI8O72KjDiO4zi15euApO5PTY7I7AvazU8dxxkRhN5I3RjOcXrPC8ASqfuDgX8ViUhEFmPo5jCA54Eli2XNccpBRJZm6IkLc4HzK8qO4zjDEFWV9r9yHMdx+owTga8xT/+6r4isrqoPVJgnx3H6APeA7TiOM4JR1QdU9fvAhsB1kZ9sLSLv73
G2HMdxHGckMj7y7J6e58JxHMepJSKyKrBv8PjEKvLiOI5TBBEZB7wreHxuFXlxnBHOacH93iKyRPSX7dkfWDB1fxdwc8G4HKdMdse8Fg5ylapOqyozjuM4juM4Tv1R1YeAi1OPRmGODB3HcVriBtiO4zgOqvoUppR8KvLnj/c4O47jOI4zElks8uzVnufCcRzHqSufYagRyfWq+t+qMuM4jlOAnRlqqHmHe5FynEq4Brgzdb8AZkhdhIOD+98XjMdxymbv4N5PXHAcx3Ecx3Gy8Lvg/iARWbSSnDiO0ze4AbbjOI4DQOIB4ujIn96eeClyHMdxHKd7LBB5pj3PheM4jlM7RGQMcEDw+Mwq8uI4jtMBewX3bgznONURGkoflDcCEdkA2Dz1aA5wSieZcpwyEJEFaTxxwcccx3Ecx3EcJwsXADNS9wsBfmK84zgtGVN1BhzHcZxacQ5wTPBsfmB94PqikYrIfMCWwHrA0sBsYBpwL3Cdqr5RNO4WaS6epLk8sAzmLe6Z5Lq+V0cOJosRmyT5GA08CzwKXKmqXfFsmrz7FsBy2LvPn6Q7+O5PdiPdIA/jsUWYVbGJycvA/dhxjy/kiGdd7PutAIzF3uEB7PvN6UK+BdgQWB37dkthHminAVOx71d6umUgImOxb74ylu8lgJnAS9g3uzPxdl9GWssmaS2bXG9gZfM0cI2qvlRGOnVCRJYDNsXqxbLYRsZpzHvn53uQh/FYe1gVWAT77k8Dp6vqaxnCLwusg9XvxZnXNp8HHsf641e6lPf5gG2SvK+QPH4WuAO4oRvjQJN8rA2sjZXh0lgbmYb1y9eq6uu9yEcVJJuptgJWxOrxAti7PwPcrKqP9SgfawJvxfqqhYFZwFOqOiwW6kVkGWBr4C3Y+72IfeNrVfXhLqW5FiZjLcu8cesZrF5fp6qzu5FukIdxmNy1JrAkpmt4EbhMVe9oE3Z0Em5DrG4uislMr2Fj2KPAQ8ADqjq3W+/QCXWQH0RkFDZObYjVBcX62Qcx+WtWN9MvSvLtNsPqwIrAfNi4dDdWf3vSLydj7EZY+S2DjbHPAk8AV3drfMzBuzHZPs1ZZUWeGKlsi/XNy2Pv/xTwX+BWVR1xG4L6tU2VQdIuN8baxLJYn/wk8Aj27jMrzF5TRtp8KNCzLJU8fhq4SVVvzRHPopj+YG1sjvAq1v7/U6Z8mIz3uwePa2cMJyKrYN9jVWAcNiY9jX2Pp7uQ3qAMsSY2/izJPPnxIaw8uy7/pPRIg3PFUdi7/72dDJvUxXWBDbD8L4r1mTOA6Vjf8aCqTu1W/jslqZ+bYuW+DNZ/vITJcvdh86WujoUV6E5PBX7IvDXCzURkI1W9LUcchwT3F6rq01athxd1kPd7RTKebgGshI1xiwIvYO96p6re2SJ4WXkoPL9MeBfWhw9yl6reVyAfS2Hz+5Wwcp+B9Wk3JsfTd50qy6MmawxvweaLq2CnaEzDdJhX90IXWxW91l+KyBKYznhNbAxcGNPJDMpB1/Xqe5fQ/mNx9lS+K4Nez0frorvvlDroNuogMySy7caYXLkctg7wKnCbql5SMM5lsbWF1bC1qdeAh7H6mHmeLSKrMW/eviBWzx4GLi+zb0vqwlpY37Y0Nn7OwcbQZ4FbejU/Gcn6pW7Qi/mnqr4qIhcB7009/giNnrEdx3Hmoap++eWXX371+QVMwITL9DVQMK5XInHt1uS3kyK/nZD6+/LAzzGBN/zd4PUCcCywVAnfYQzwUeDf2ESqWZpzMYPyLwHzl/S9J6X+Pj/wRWyxrFkeZmIG7+uWVAcWTN7nqjbvrsDtwNeBhQqkM9CqngH7Jd+2WdqvY95wVm6RxvzA57BFrlb15gfAuJK+3+ZJvp5q8+1eAs4Gtuxmm86R79GYN8SLMAVKq7wrZsz0c2CTgnXsq8ANSRtqlsZs4ArsGNrRGeNeLgmXjufcEr7PCZH8rZ8j/BLAt4Gb2rzzG0m9/zgwX4F8Tg3im5L623
zAJ4HbWqQ/oUm8iwAfAv6AKbHa1Y85yXscCowtqY6Ox5QSz7dI9zngJ8AyqXCTsr5nhjysBfw68p3Da0bSlnbuUfu
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 3600x3600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {
|
|||
|
"needs_background": "light"
|
|||
|
},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# create plot from analysis result\n",
|
|||
|
"\n",
|
|||
|
"x = []\n",
|
|||
|
"y = []\n",
|
|||
|
"\n",
|
|||
|
"for key, value in sorted_d.items():\n",
|
|||
|
" x.append(key)\n",
|
|||
|
" y.append(value)\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(50,50))\n",
|
|||
|
"plt.title(\"Phoneme coverage for \" + CONFIG_RUN_NAME + \" (\" + CONFIG_RUN_DESC + \")\", fontsize=50)\n",
|
|||
|
"plt.xticks(fontsize=50)\n",
|
|||
|
"plt.yticks(fontsize=50)\n",
|
|||
|
"plt.barh(x,y, align='center', alpha=1.0)\n",
|
|||
|
"plt.gca().invert_yaxis()\n",
|
|||
|
"plt.ylabel('phoneme', fontsize=50)\n",
|
|||
|
"plt.xlabel('occurrences', fontsize=50)\n",
|
|||
|
"\n",
|
|||
|
"for i, v in enumerate(y):\n",
|
|||
|
" plt.text(v + 2, i - .2, str(v), fontsize=20)\n",
|
|||
|
" plt.text(v + 2, i + .2, \"(\" + str(round(100/phonemesSum * v,2)) + \"%)\", fontsize=20)\n",
|
|||
|
" \n",
|
|||
|
" \n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.6.9"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 4
|
|||
|
}
|