mirror of https://github.com/coqui-ai/TTS.git
251 lines
7.2 KiB
Plaintext
251 lines
7.2 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"Collapsed": "false"
|
||
},
|
||
"source": [
|
||
"# Jupyter Notebook for phoneme coverage analysis\n",
|
||
"\n",
|
||
"This Jupyter notebook checks the dataset configured in config.json for phoneme coverage.\n",
|
||
"As mentioned here https://github.com/mozilla/TTS/wiki/Dataset#what-makes-a-good-dataset a good phoneme coverage is recommended.\n",
|
||
"\n",
|
||
"Most parameters will be taken from config.json file in mozilla tts repo so please ensure it's configured correctly for your dataset.\n",
|
||
"This notebook uses lots of existing code from the TTS repo to ensure future compatibility.\n",
|
||
"\n",
|
||
"Many thanks to Neil Stoker for supporting me on this topic :-).\n",
|
||
"\n",
|
||
"I provide this notebook without any warranty but it's hopefully useful for your dataset analysis.\n",
|
||
"\n",
|
||
"Happy TTS'ing :-)\n",
|
||
"\n",
|
||
"Thorsten Müller\n",
|
||
"\n",
|
||
"* https://github.com/thorstenMueller/deep-learning-german-tts\n",
|
||
"* https://discourse.mozilla.org/t/contributing-my-german-voice-for-tts/"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"Collapsed": "false"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# User-tunable settings for this analysis\n",
|
||
"\n",
|
||
"# Path to the config.json that describes the dataset to analyse\n",
|
||
"CONFIG_FILE = \"/path/to/config/config.json\"\n",
|
||
"\n",
|
||
"# Every character listed here is stripped from a transcript before phonemization\n",
|
||
"CHARS_TO_REMOVE = \".,:!?'\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"Collapsed": "false"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# import stuff\n",
|
||
"from TTS.utils.io import load_config\n",
|
||
"from TTS.tts.datasets.preprocess import load_meta_data\n",
|
||
"from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme\n",
|
||
"from tqdm import tqdm\n",
|
||
"from matplotlib import pylab as plt\n",
|
||
"from multiprocessing import Pool, cpu_count\n",
|
||
"\n",
|
||
"# extra imports that might not be included in requirements.txt\n",
|
||
"import collections\n",
|
||
"import operator\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"Collapsed": "false",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Load config.json properties\n",
|
||
"CONFIG = load_config(CONFIG_FILE)\n",
|
||
"\n",
|
||
"# Load some properties from config.json\n",
|
||
"# Only the first element of load_meta_data's result is analysed\n",
|
||
"# (presumably the training split - confirm against the TTS version in use).\n",
|
||
"CONFIG_METADATA = sorted(load_meta_data(CONFIG.datasets)[0])\n",
|
||
"CONFIG_DATASET = CONFIG.datasets[0]\n",
|
||
"CONFIG_PHONEME_LANGUAGE = CONFIG.phoneme_language\n",
|
||
"CONFIG_TEXT_CLEANER = CONFIG.text_cleaner\n",
|
||
"CONFIG_ENABLE_EOS_BOS_CHARS = CONFIG.enable_eos_bos_chars\n",
|
||
"\n",
|
||
"# Will be printed on generated output graph\n",
|
||
"CONFIG_RUN_NAME = CONFIG.run_name\n",
|
||
"CONFIG_RUN_DESC = CONFIG.run_description"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"Collapsed": "false",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# print some debug information on loaded config values\n",
|
||
"# f-strings are used so that a non-string config value (e.g. null in config.json)\n",
|
||
"# is printed instead of raising TypeError during string concatenation.\n",
|
||
"print(f\" > Run name: {CONFIG_RUN_NAME} ({CONFIG_RUN_DESC})\")\n",
|
||
"print(f\" > Dataset files: {len(CONFIG_METADATA)}\")\n",
|
||
"print(f\" > Phoneme language: {CONFIG_PHONEME_LANGUAGE}\")\n",
|
||
"print(f\" > Used text cleaner: {CONFIG_TEXT_CLEANER}\")\n",
|
||
"print(f\" > Enable eos bos chars: {CONFIG_ENABLE_EOS_BOS_CHARS}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def get_phoneme_from_sequence(text):\n",
|
||
"    \"\"\"Turn one metadata item into its phoneme string.\n",
|
||
"\n",
|
||
"    `text[0]` is used as the transcript. Returns a one-element list holding the\n",
|
||
"    phoneme string with all spaces removed, or an empty list when the\n",
|
||
"    transcript is empty.\n",
|
||
"    \"\"\"\n",
|
||
"    transcript = text[0]\n",
|
||
"    if len(transcript) == 0:\n",
|
||
"        return []\n",
|
||
"\n",
|
||
"    # Drop trailing newlines, then delete every character listed in CHARS_TO_REMOVE\n",
|
||
"    removal_table = str.maketrans(\"\", \"\", CHARS_TO_REMOVE)\n",
|
||
"    cleaned = transcript.rstrip('\\n').translate(removal_table)\n",
|
||
"\n",
|
||
"    seq = phoneme_to_sequence(cleaned, [CONFIG_TEXT_CLEANER], CONFIG_PHONEME_LANGUAGE, CONFIG_ENABLE_EOS_BOS_CHARS)\n",
|
||
"    phoneme_text = sequence_to_phoneme(seq)\n",
|
||
"    return [phoneme_text.replace(\" \", \"\")]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"Collapsed": "false",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Get phonemes from metadata\n",
|
||
"\n",
|
||
"# Leave one core free, but never ask for fewer than one worker:\n",
|
||
"# Pool(0) raises ValueError, which is what cpu_count()-1 yields on a single-core machine.\n",
|
||
"num_workers = max(1, cpu_count() - 1)\n",
|
||
"\n",
|
||
"with Pool(num_workers) as p:\n",
|
||
"    # imap keeps the input order; tqdm shows progress over the metadata items\n",
|
||
"    phonemes_per_item = list(tqdm(p.imap(get_phoneme_from_sequence, CONFIG_METADATA), total=len(CONFIG_METADATA)))\n",
|
||
"\n",
|
||
"# Each worker call returns a list; flatten them into one list of phoneme strings\n",
|
||
"phonemes = [i for sub in phonemes_per_item for i in sub]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"Collapsed": "false",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Concatenate all phoneme strings and count each symbol\n",
|
||
"phonemeString = \"\".join(phonemes)\n",
|
||
"\n",
|
||
"# Counter is the public API for counting; most_common() orders by descending count\n",
|
||
"# and keeps first-seen order for ties.\n",
|
||
"sorted_d = dict(collections.Counter(phonemeString).most_common())\n",
|
||
"\n",
|
||
"# remove useless keys\n",
|
||
"sorted_d.pop(' ', None)\n",
|
||
"sorted_d.pop('ˈ', None)\n",
|
||
"\n",
|
||
"# Total over the symbols that are actually kept, so the per-phoneme percentages\n",
|
||
"# derived from it add up to 100%.\n",
|
||
"phonemesSum = sum(sorted_d.values())\n",
|
||
"\n",
|
||
"print(\"Dataset contains \" + str(len(sorted_d)) + \" different ipa phonemes.\")\n",
|
||
"print(\"Dataset consists of \" + str(phonemesSum) + \" phonemes\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"Collapsed": "false",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"print(\"5 rarest phonemes\")\n",
|
||
"\n",
|
||
"# Sort ascending by count and keep the first five entries\n",
|
||
"ascending_counts = sorted(sorted_d.items(), key=operator.itemgetter(1))\n",
|
||
"rareList = dict(ascending_counts[:5])\n",
|
||
"\n",
|
||
"for phoneme, count in rareList.items():\n",
|
||
"    print(f\"{phoneme} --> {count} occurrences\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"Collapsed": "false"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# create plot from analysis result\n",
|
||
"\n",
|
||
"# Bars follow the iteration order of sorted_d\n",
|
||
"x = list(sorted_d.keys())\n",
|
||
"y = list(sorted_d.values())\n",
|
||
"\n",
|
||
"plt.figure(figsize=(50,50))\n",
|
||
"# f-string so a non-string run name/description cannot raise TypeError here\n",
|
||
"plt.title(f\"Phoneme coverage for {CONFIG_RUN_NAME} ({CONFIG_RUN_DESC})\", fontsize=50)\n",
|
||
"plt.xticks(fontsize=50)\n",
|
||
"plt.yticks(fontsize=50)\n",
|
||
"plt.barh(x,y, align='center', alpha=1.0)\n",
|
||
"# Flip the axis so the first entry is drawn at the top\n",
|
||
"plt.gca().invert_yaxis()\n",
|
||
"plt.ylabel('phoneme', fontsize=50)\n",
|
||
"plt.xlabel('occurrences', fontsize=50)\n",
|
||
"\n",
|
||
"# Annotate every bar with its absolute count and its share of phonemesSum\n",
|
||
"for i, v in enumerate(y):\n",
|
||
"    plt.text(v + 2, i - .2, str(v), fontsize=20)\n",
|
||
"    plt.text(v + 2, i + .2, \"(\" + str(round(100/phonemesSum * v,2)) + \"%)\", fontsize=20)\n",
|
||
"\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"Collapsed": "false"
|
||
},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.6.9-final"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 4
|
||
} |