{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Overview\n",
|
|
"\n",
|
|
"This notebook can be used with both a single or multi- speaker corpus and allows the interactive plotting of speaker embeddings linked to underlying audio (see instructions in the repo's speaker_embedding directory)\n",
|
|
"\n",
|
|
"Depending on the directory structure used for your corpus, you may need to adjust handling of **speaker_to_utter** and **locations**."
|
|
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import glob\n",
"import numpy as np\n",
"import umap\n",
"\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.config import load_config\n",
"\n",
"from bokeh.io import output_notebook, show\n",
"from bokeh.plotting import figure\n",
"from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n",
"from bokeh.transform import factor_cmap\n",
"from bokeh.palettes import Category10"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For larger sets of speakers, you can use **Category20**, but you need to change it in the **pal** variable too\n",
|
|
"\n",
|
|
"List of Bokeh palettes here: http://docs.bokeh.org/en/1.4.0/docs/reference/palettes.html\n",
|
|
"\n",
|
|
"**NB:** if you have problems with other palettes, first see https://stackoverflow.com/questions/48333820/why-do-some-bokeh-palettes-raise-a-valueerror-when-used-in-factor-cmap"
|
|
]
},
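{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell is an optional, hedged sketch (not part of the original workflow): it shows how **Category20** could be selected when there are more than 10 speakers. The **example_factors** list is a hypothetical placeholder; in the plotting cell further down, the real factors come from **labels** and **pal** is redefined with **Category10**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch (assumption, not part of the original notebook):\n",
"# Category10 only provides palettes for 3..10 factors, so with more than 10\n",
"# speakers you could switch to Category20 (3..20 factors) as shown here.\n",
"from bokeh.palettes import Category20\n",
"\n",
"example_factors = [str(i) for i in range(15)]  # hypothetical speaker labels\n",
"pal_size = min(max(len(example_factors), 3), 20)  # clamp to Category20's 3..20 range\n",
"pal = Category20[pal_size]  # the plotting cell below overwrites pal with Category10\n",
"print(f'Palette with {len(pal)} colours selected')"
]
},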
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"output_notebook()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You should also adjust all the path constants to point at the relevant locations for you locally"
|
|
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n",
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth\"\n",
"CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
"\n",
"# My single speaker locations\n",
"#EMBED_PATH = \"/home/neil/main/Projects/TTS3/embeddings/neil14/\"\n",
"#AUDIO_PATH = \"/home/neil/data/Projects/NeilTTS/neil14/wavs/\"\n",
"\n",
"# My multi speaker locations\n",
"EMBED_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360-embed_128/\"\n",
"AUDIO_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360/\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!ls -1 $MODEL_RUN_PATH"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"CONFIG = load_config(CONFIG_PATH)\n",
"ap = AudioProcessor(**CONFIG['audio'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Bring in the embeddings created by **compute_embeddings.py**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embed_files = glob.glob(EMBED_PATH+\"/**/*.npy\", recursive=True)\n",
"print(f'Embeddings found: {len(embed_files)}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check that we did indeed find an embedding"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embed_files[0]"
]
},
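{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, a hedged sanity check (not in the original notebook): load the first embedding and inspect its shape, assuming each **.npy** file holds a single utterance embedding."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (assumption): each .npy file is expected to load as a small\n",
"# array holding one utterance embedding.\n",
"first_embed = np.load(embed_files[0])\n",
"print(f'First embedding shape: {first_embed.shape}')"
]
},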
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Process the speakers\n",
|
|
"\n",
|
|
"Assumes count of **speaker_paths** corresponds to number of speakers (so a corpus in just one directory would be treated like a single speaker and the multiple directories of LibriTTS are treated as distinct speakers)"
|
|
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"speaker_paths = list(set([os.path.dirname(os.path.dirname(embed_file)) for embed_file in embed_files]))\n",
|
|
"speaker_to_utter = {}\n",
|
|
"for embed_file in embed_files:\n",
|
|
" speaker_path = os.path.dirname(os.path.dirname(embed_file))\n",
|
|
" try:\n",
|
|
" speaker_to_utter[speaker_path].append(embed_file)\n",
|
|
" except:\n",
|
|
" speaker_to_utter[speaker_path]=[embed_file]\n",
|
|
"print(f'Speaker count: {len(speaker_paths)}')"
|
|
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set up the embeddings\n",
|
|
"\n",
|
|
"Adjust the number of speakers to select and the number of utterances from each speaker and they will be randomly sampled from the corpus"
|
|
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embeds = []\n",
"labels = []\n",
"locations = []\n",
"\n",
"# single speaker\n",
"#num_speakers = 1\n",
"#num_utters = 1000\n",
"\n",
"# multi speaker\n",
"num_speakers = 10\n",
"num_utters = 20\n",
"\n",
"speaker_idxs = np.random.choice(range(len(speaker_paths)), num_speakers, replace=False)\n",
"\n",
"for speaker_num, speaker_idx in enumerate(speaker_idxs):\n",
"    speaker_path = speaker_paths[speaker_idx]\n",
"    speakers_utter = speaker_to_utter[speaker_path]\n",
"    utter_idxs = np.random.randint(0, len(speakers_utter), num_utters)\n",
"    for utter_idx in utter_idxs:\n",
"        embed_path = speaker_to_utter[speaker_path][utter_idx]\n",
"        embed = np.load(embed_path)\n",
"        embeds.append(embed)\n",
"        labels.append(str(speaker_num))\n",
"        locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy', '.wav'))\n",
"embeds = np.concatenate(embeds)"
]
},
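{
"cell_type": "markdown",
"metadata": {},
"source": [
"A hedged sanity check (not in the original notebook): confirm that the stacked embeddings, labels and locations line up before projecting them."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check (assumption): after np.concatenate, embeds should have one row per\n",
"# sampled utterance (assuming each .npy stores a (1, dim) array), matching the number\n",
"# of labels and locations collected above.\n",
"print(f'embeds: {embeds.shape}, labels: {len(labels)}, locations: {len(locations)}')"
]
},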
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load embeddings with UMAP"
|
|
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = umap.UMAP()\n",
"projection = model.fit_transform(embeds)"
]
},
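{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, a hedged sketch (not part of the original notebook): UMAP's **n_neighbors** and **min_dist** parameters control how local or global the projection is, and **random_state** makes the layout repeatable. The values below are illustrative defaults, not tuned settings."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch (assumption): an alternative projection with explicit UMAP parameters.\n",
"# Larger n_neighbors emphasises global structure; smaller min_dist packs clusters tighter;\n",
"# random_state makes the layout repeatable across runs.\n",
"model = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42)\n",
"projection = model.fit_transform(embeds)"
]
},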
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Interactively charting the data in Bokeh\n",
|
|
"\n",
|
|
"Set up various details for Bokeh to plot the data\n",
|
|
"\n",
|
|
"You can use the regular Bokeh [tools](http://docs.bokeh.org/en/1.4.0/docs/user_guide/tools.html?highlight=tools) to explore the data, with reset setting it back to normal\n",
|
|
"\n",
|
|
"Once you have started the local server (see cell below) you can then click on plotted points which will open a tab to play the audio for that point, enabling easy exploration of your corpus\n",
|
|
"\n",
|
|
"File location in the tooltip is given relative to **AUDIO_PATH**"
|
|
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"source_wav_stems = ColumnDataSource(\n",
"    data=dict(\n",
"        x=projection.T[0].tolist(),\n",
"        y=projection.T[1].tolist(),\n",
"        desc=locations,\n",
"        label=labels\n",
"    )\n",
")\n",
"\n",
"hover = HoverTool(\n",
"    tooltips=[\n",
"        (\"file\", \"@desc\"),\n",
"        (\"speaker\", \"@label\"),\n",
"    ]\n",
")\n",
"\n",
"# optionally consider adding these to the tooltips if you want additional detail\n",
"# for the coordinates: (\"(x,y)\", \"($x, $y)\"),\n",
"# for the index of the embedding / wav file: (\"index\", \"$index\"),\n",
"\n",
"factors = list(set(labels))\n",
"pal_size = max(len(factors), 3)\n",
"pal = Category10[pal_size]\n",
"\n",
"p = figure(plot_width=600, plot_height=400, tools=[hover, BoxZoomTool(), ResetTool(), TapTool()])\n",
"\n",
"p.circle('x', 'y', source=source_wav_stems, color=factor_cmap('label', palette=pal, factors=factors))\n",
"\n",
"url = \"http://localhost:8000/@desc\"\n",
"taptool = p.select(type=TapTool)\n",
"taptool.callback = OpenURL(url=url)\n",
"\n",
"show(p)"
]
},
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Local server to serve wav files from corpus\n",
|
|
"\n",
|
|
"This is required so that when you click on a data point the hyperlink associated with it will be served the file locally.\n",
|
|
"\n",
|
|
"There are other ways to serve this if you prefer and you can also run the commands manually on the command line\n",
|
|
"\n",
|
|
"The server will continue to run until stopped. To stop it simply interupt the kernel (ie square button or under Kernel menu)"
|
|
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%cd $AUDIO_PATH\n",
|
|
"%pwd\n",
|
|
"!python -m http.server"
|
|
]
|
|
}
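,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A hedged alternative (not part of the original notebook): the cell above blocks the kernel while the server runs. If you prefer to keep the notebook usable, you could start the server as a background subprocess instead and terminate it when you are done."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch (assumption): serve the corpus in the background so the kernel is not blocked.\n",
"import subprocess\n",
"\n",
"server = subprocess.Popen(['python', '-m', 'http.server', '8000'], cwd=AUDIO_PATH)\n",
"print(f'Serving {AUDIO_PATH} on http://localhost:8000 (pid {server.pid})')\n",
"\n",
"# When finished exploring the plot, stop the server:\n",
"# server.terminate()"
]
}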
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}