mirror of https://github.com/coqui-ai/TTS.git
Merge branch 'bokeh-interactive-1' into dev
commit
574de86b9b
|
@ -1,8 +1,12 @@
|
|||
### Speaker embedding (Experimental)
|
||||
|
||||
This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding, so you can generate d-vectors for multi-speaker TTS or prune bad samples from your TTS dataset. Below is an example showing the embedding results of various speakers. You can generate the same plot with the provided notebook.
|
||||
This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
|
||||
|
||||
![](https://user-images.githubusercontent.com/1402048/64603079-7fa5c100-d3c8-11e9-88e7-88a00d0e37d1.png)
|
||||
With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
|
||||
|
||||
Below is an example showing the embedding results of various speakers. You can generate the same plot with the provided notebook, as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
|
||||
|
||||
![](umap.png)
|
||||
|
||||
Download a pretrained model from the [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
|
||||
|
||||
|
|
|
@ -14,27 +14,52 @@ parser = argparse.ArgumentParser(
|
|||
description="Compute embedding vectors for each wav file in a dataset. "
|
||||
)
|
||||
parser.add_argument(
|
||||
"model_path", type=str, help="Path to model outputs (checkpoint, tensorboard etc.)."
|
||||
)
|
||||
'data_path',
|
||||
type=str,
|
||||
help='Data path for wav files - directory or CSV file')
|
||||
parser.add_argument(
|
||||
"config_path", type=str, help="Path to config file for training.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"data_path", type=str, help="Defines the data path. It overwrites config.json."
|
||||
)
|
||||
parser.add_argument("output_path", type=str, help="path for training outputs.")
|
||||
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=False)
|
||||
parser.add_argument(
|
||||
'--separator', type=str, help='Separator used in file if CSV is passed for data_path', default='|'
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
c = load_config(args.config_path)
|
||||
ap = AudioProcessor(**c["audio"])
|
||||
|
||||
wav_files = glob.glob(args.data_path + "/**/*.wav", recursive=True)
|
||||
output_files = [
|
||||
wav_file.replace(args.data_path, args.output_path).replace(".wav", ".npy")
|
||||
for wav_file in wav_files
|
||||
]
|
||||
data_path = args.data_path
|
||||
split_ext = os.path.splitext(data_path)
|
||||
sep = args.separator
|
||||
|
||||
if len(split_ext) > 0 and split_ext[1].lower() == '.csv':
|
||||
# Parse CSV
|
||||
print(f'CSV file: {data_path}')
|
||||
with open(data_path) as f:
|
||||
wav_path = os.path.join(os.path.dirname(data_path), 'wavs')
|
||||
wav_files = []
|
||||
print(f'Separator is: {sep}')
|
||||
for line in f:
|
||||
components = line.split(sep)
|
||||
if len(components) != 2:
|
||||
print("Invalid line")
|
||||
continue
|
||||
wav_file = os.path.join(wav_path, components[0] + '.wav')
|
||||
#print(f'wav_file: {wav_file}')
|
||||
if os.path.exists(wav_file):
|
||||
wav_files.append(wav_file)
|
||||
print(f'Count of wavs imported: {len(wav_files)}')
|
||||
else:
|
||||
# Parse all wav files in data_path
|
||||
wav_path = data_path
|
||||
wav_files = glob.glob(data_path + '/**/*.wav', recursive=True)
|
||||
|
||||
output_files = [wav_file.replace(wav_path, args.output_path).replace(
|
||||
'.wav', '.npy') for wav_file in wav_files]
|
||||
|
||||
for output_file in output_files:
|
||||
os.makedirs(os.path.dirname(output_file), exist_ok=True)
|
||||
|
|
File diff suppressed because one or more lines are too long
Binary file not shown.
Before Width: | Height: | Size: 23 KiB After Width: | Height: | Size: 24 KiB |
Loading…
Reference in New Issue