Merge branch 'bokeh-interactive-1' into dev

pull/10/head
Eren Golge 2019-11-14 15:38:14 +01:00
commit 574de86b9b
4 changed files with 636 additions and 78 deletions

View File

@ -1,8 +1,12 @@
### Speaker embedding (Experimental)
This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding. So you can generate d-vectors for multi-speaker TTS or prune bad samples from your TTS dataset. Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook.
This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
![](https://user-images.githubusercontent.com/1402048/64603079-7fa5c100-d3c8-11e9-88e7-88a00d0e37d1.png)
With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
![](umap.png)
Download a pretrained model from the [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.

View File

@ -14,27 +14,52 @@ parser = argparse.ArgumentParser(
description="Compute embedding vectors for each wav file in a dataset. "
)
parser.add_argument(
"model_path", type=str, help="Path to model outputs (checkpoint, tensorboard etc.)."
)
'data_path',
type=str,
help='Data path for wav files - directory or CSV file')
parser.add_argument(
"config_path", type=str, help="Path to config file for training.",
)
parser.add_argument(
"data_path", type=str, help="Defines the data path. It overwrites config.json."
)
parser.add_argument("output_path", type=str, help="path for training outputs.")
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=False)
parser.add_argument(
'--separator', type=str, help='Separator used in file if CSV is passed for data_path', default='|'
)
args = parser.parse_args()
c = load_config(args.config_path)
ap = AudioProcessor(**c["audio"])
wav_files = glob.glob(args.data_path + "/**/*.wav", recursive=True)
output_files = [
wav_file.replace(args.data_path, args.output_path).replace(".wav", ".npy")
for wav_file in wav_files
]
# Resolve the wav files to process. `data_path` is either a metadata file
# ending in `.csv` (each valid line has exactly two `sep`-separated fields, the
# first being a wav file name without extension, with the audio looked up in a
# `wavs/` directory next to the metadata file) or a directory that is searched
# recursively for `*.wav` files.
data_path = args.data_path
split_ext = os.path.splitext(data_path)
sep = args.separator

# os.path.splitext always returns a (root, ext) pair, so only the extension
# needs to be inspected.
if split_ext[1].lower() == '.csv':
    # Parse CSV
    print(f'CSV file: {data_path}')
    wav_path = os.path.join(os.path.dirname(data_path), 'wavs')
    wav_files = []
    print(f'Separator is: {sep}')
    with open(data_path) as f:
        for line in f:
            components = line.split(sep)
            if len(components) != 2:
                print("Invalid line")
                continue
            wav_file = os.path.join(wav_path, components[0] + '.wav')
            # Entries whose audio file is missing are skipped silently.
            if os.path.exists(wav_file):
                wav_files.append(wav_file)
    print(f'Count of wavs imported: {len(wav_files)}')
else:
    # Parse all wav files in data_path
    wav_path = data_path
    wav_files = glob.glob(data_path + '/**/*.wav', recursive=True)

# Mirror the layout under `wav_path` into `args.output_path`, swapping only the
# file extension for `.npy`. Working on the relative path (rather than
# str.replace on the whole string) keeps nested directories that happen to
# repeat the root name, or that contain ".wav", intact, and tolerates a
# trailing slash on the input path.
output_files = [
    os.path.join(
        args.output_path,
        os.path.splitext(os.path.relpath(wav_file, wav_path))[0] + '.npy',
    )
    for wav_file in wav_files
]
for output_file in output_files:
os.makedirs(os.path.dirname(output_file), exist_ok=True)

File diff suppressed because one or more lines are too long

Binary file not shown.

Before

Width:  |  Height:  |  Size: 23 KiB

After

Width:  |  Height:  |  Size: 24 KiB