mirror of https://github.com/coqui-ai/TTS.git
353 lines
8.5 KiB
Plaintext
353 lines
8.5 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"TTS_PATH = \"/home/erogol/projects/\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"import sys\n",
|
|
"sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n",
|
|
"import glob\n",
|
|
"import librosa\n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"from scipy.stats import norm\n",
|
|
"from tqdm import tqdm_notebook as tqdm\n",
|
|
"from multiprocessing import Pool\n",
|
|
"from matplotlib import pylab as plt\n",
|
|
"from collections import Counter\n",
|
|
"from TTS.tts.datasets.preprocess import *\n",
|
|
"%matplotlib inline"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"DATA_PATH = \"/home/erogol/Data/m-ai-labs/de_DE/by_book/male/karlsson/\"\n",
|
|
"META_DATA = [\"kleinzaches/metadata.csv\",\n",
|
|
" \"spiegel_kaetzchen/metadata.csv\",\n",
|
|
" \"herrnarnesschatz/metadata.csv\",\n",
|
|
" \"maedchen_von_moorhof/metadata.csv\",\n",
|
|
" \"koenigsgaukler/metadata.csv\",\n",
|
|
" \"altehous/metadata.csv\",\n",
|
|
" \"odysseus/metadata.csv\",\n",
|
|
" \"undine/metadata.csv\",\n",
|
|
" \"reise_tilsit/metadata.csv\",\n",
|
|
" \"schmied_seines_glueckes/metadata.csv\",\n",
|
|
" \"kammmacher/metadata.csv\",\n",
|
|
" \"unterm_birnbaum/metadata.csv\",\n",
|
|
" \"liebesbriefe/metadata.csv\",\n",
|
|
" \"sandmann/metadata.csv\"]\n",
|
|
"NUM_PROC = 8"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# use your own preprocessor at this stage - TTS/tts/datasets/preprocess.py\n",
|
|
"items = mailabs(DATA_PATH, META_DATA)\n",
|
|
"print(\" > Number of audio files: {}\".format(len(items)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Collect every wav path referenced by the metadata and report the ones missing on disk.\n",
"# item[1] is used as the audio file path of each entry.\n",
"wav_files = []\n",
"for item in items:\n",
"    wav_file = item[1].strip()\n",
"    wav_files.append(wav_file)\n",
"    if not os.path.exists(wav_file):\n",
"        # print the path that could not be found\n",
"        print(wav_file)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# list the wav paths that occur more than once in the metadata\n",
"wav_counts = Counter(wav_files)\n",
"duplicates = [path for path, n_seen in wav_counts.items() if n_seen > 1]\n",
"print(duplicates)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"def load_item(item):\n",
"    \"\"\"Load one dataset entry and measure its text and audio lengths.\n",
"\n",
"    `item[0]` is read as the transcript and `item[1]` as the audio file path.\n",
"    Returns `(file_name, text, text_len, audio, audio_len)`, where `text_len`\n",
"    is the number of characters and `audio_len` is the number of samples\n",
"    divided by the sampling rate reported by librosa.\n",
"    \"\"\"\n",
"    file_name, text = item[1].strip(), item[0].strip()\n",
"    # sr=None: keep the file's own sampling rate rather than resampling\n",
"    audio, sr = librosa.load(file_name, sr=None)\n",
"    return file_name, text, len(text), audio, len(audio) / sr\n",
|
|
"\n",
|
"# This will take a while depending on size of dataset\n",
"if NUM_PROC == 1:\n",
"    # serial path: no worker processes, easier to debug\n",
"    data = [load_item(m) for m in tqdm(items)]\n",
"else:\n",
"    # size the pool from the NUM_PROC setting instead of a fixed worker count\n",
"    with Pool(NUM_PROC) as p:\n",
"        # imap yields results in input order, so data[i] belongs to items[i]\n",
"        data = list(tqdm(p.imap(load_item, items), total=len(items)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# count words in the dataset\n",
"w_count = Counter()\n",
"for entry in tqdm(data):\n",
"    # entry[1] is the transcript; lowercase it so counting is case-insensitive\n",
"    w_count.update(entry[1].lower().strip().split())\n",
"# len(w_count) is the number of distinct words\n",
"print(\" > Number of words: {}\".format(len(w_count)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"text_vs_durs = {}  # text length vs audio duration\n",
"text_len_counter = Counter()  # number of sentences with the keyed length\n",
"for item in tqdm(data):\n",
"    text = item[1].lower().strip()\n",
"    text_len = len(text)\n",
"    text_len_counter[text_len] += 1\n",
"    audio_len = item[-1]\n",
"    # setdefault creates the bucket the first time a length is seen,\n",
"    # so no try/except is needed and unrelated errors are not swallowed\n",
"    text_vs_durs.setdefault(text_len, []).append(audio_len)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# per text length: mean, median and standard deviation of the audio durations\n",
"text_vs_avg = {n_chars: np.mean(lens) for n_chars, lens in text_vs_durs.items()}\n",
"text_vs_median = {n_chars: np.median(lens) for n_chars, lens in text_vs_durs.items()}\n",
"text_vs_std = {n_chars: np.std(lens) for n_chars, lens in text_vs_durs.items()}"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Avg audio length per char"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# print every entry whose last field (the audio length) is below 2\n",
"for short_item in (entry for entry in data if entry[-1] < 2):\n",
"    print(short_item)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# audio length divided by transcript length, one value per entry\n",
"sec_per_chars = []\n",
"for item in data:\n",
"    text = item[1]\n",
"    dur = item[-1]\n",
"    if not text:\n",
"        # an empty transcript has no per-character rate; skip it instead of dividing by zero\n",
"        continue\n",
"    sec_per_chars.append(dur / len(text))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# mean and standard deviation of the per-character durations\n",
"mean, std = np.mean(sec_per_chars), np.std(sec_per_chars)\n",
"print(mean, std, sep=\"\\n\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# normal distribution parameterised by the measured mean and std\n",
"dist = norm(mean, std)\n",
"\n",
"# find irregular instances long or short voice durations\n",
"for item in data:\n",
"    text = item[1]\n",
"    dur = item[-1]\n",
"    if not text:\n",
"        # nothing to normalise by; an empty transcript is itself irregular\n",
"        print(item)\n",
"        continue\n",
"    sec_per_char = dur / len(text)\n",
"    # score against the fitted distribution; a bare norm.pdf() would use the\n",
"    # standard normal (mean 0, std 1) and ignore mean/std entirely\n",
"    pdf = dist.pdf(sec_per_char)\n",
"    # NOTE(review): 0.39 is an absolute density cut-off, so its strictness depends on std - confirm it suits the dataset\n",
"    if pdf < 0.39:\n",
"        print(item)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Plot Dataset Statistics"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# x: text length, y: mean audio duration of the entries with that length\n",
"plt.title(\"text length vs mean audio duration\")\n",
"plt.scatter(list(text_vs_avg), list(text_vs_avg.values()))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# x: text length, y: median audio duration of the entries with that length\n",
"plt.title(\"text length vs median audio duration\")\n",
"plt.scatter(list(text_vs_median), list(text_vs_median.values()))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# x: text length, y: standard deviation of the audio durations for that length\n",
"plt.title(\"text length vs STD\")\n",
"plt.scatter(list(text_vs_std), list(text_vs_std.values()))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# x: text length, y: number of entries with that length\n",
"plt.title(\"text length vs # instances\")\n",
"plt.scatter(list(text_len_counter), list(text_len_counter.values()))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
"### Check word frequencies"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# one row per word, column 0 holds its count; most frequent words first\n",
"w_count_df = pd.DataFrame.from_dict(w_count, orient='index').sort_values(0, ascending=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"w_count_df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# check a certain word\n",
|
|
"w_count_df.at['minute', 0]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# frequency bar plot - it takes time!!\n",
|
|
"w_count_df.plot.bar()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.2"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|