Merge pull request #519 from mueller91/dev

Speaker Encoder: New Datasets + DataLoader optimized
Eren Gölge 2020-09-21 12:48:42 +02:00 committed by GitHub
commit c514628d02
9 changed files with 227 additions and 74 deletions

View File

@@ -42,8 +42,12 @@ def setup_loader(ap, is_val=False, verbose=False):
     dataset = MyDataset(ap,
                         meta_data_eval if is_val else meta_data_train,
                         voice_len=1.6,
-                        num_utter_per_speaker=10,
+                        num_utter_per_speaker=c.num_utters_per_speaker,
+                        num_speakers_in_batch=c.num_speakers_in_batch,
                         skip_speakers=False,
+                        storage_size=c.storage["storage_size"],
+                        sample_from_storage_p=c.storage["sample_from_storage_p"],
+                        additive_noise=c.storage["additive_noise"],
                         verbose=verbose)
     # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
     loader = DataLoader(dataset,
@@ -60,6 +64,7 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
     epoch_time = 0
     best_loss = float('inf')
     avg_loss = 0
+    avg_loader_time = 0
     end_time = time.time()
     for _, data in enumerate(data_loader):
         start_time = time.time()
@@ -93,8 +98,11 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
         step_time = time.time() - start_time
         epoch_time += step_time

-        avg_loss = 0.01 * loss.item(
-        ) + 0.99 * avg_loss if avg_loss != 0 else loss.item()
+        # Averaged Loss and Averaged Loader Time
+        avg_loss = 0.01 * loss.item() \
+                   + 0.99 * avg_loss if avg_loss != 0 else loss.item()
+        avg_loader_time = 1/c.num_loader_workers * loader_time + \
+                          (c.num_loader_workers-1) / c.num_loader_workers * avg_loader_time if avg_loader_time != 0 else loader_time
         current_lr = optimizer.param_groups[0]['lr']

         if global_step % c.steps_plot_stats == 0:
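
Both running statistics above are exponential moving averages. A minimal standalone sketch with made-up numbers (not code from this commit), showing how the two weights behave:

    # avg_loss weights each new sample by 0.01; avg_loader_time weights it by
    # 1/num_loader_workers, so with 8 workers a new measurement moves the
    # average one eighth of the way toward the sample.
    num_loader_workers = 8
    avg_loss, avg_loader_time = 0, 0
    for loss, loader_time in [(1.2, 0.30), (1.0, 0.02), (0.9, 0.02)]:
        avg_loss = 0.01 * loss + 0.99 * avg_loss if avg_loss != 0 else loss
        avg_loader_time = (loader_time / num_loader_workers
                           + (num_loader_workers - 1) / num_loader_workers
                           * avg_loader_time) if avg_loader_time != 0 else loader_time
        print(f"avg_loss={avg_loss:.4f} avg_loader_time={avg_loader_time:.4f}")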
@@ -103,7 +111,8 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
                 "loss": avg_loss,
                 "lr": current_lr,
                 "grad_norm": grad_norm,
-                "step_time": step_time
+                "step_time": step_time,
+                "avg_loader_time": avg_loader_time
             }
             tb_logger.tb_train_epoch_stats(global_step, train_stats)
             figures = {
@@ -116,9 +125,9 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
         if global_step % c.print_step == 0:
             print(
                 " | > Step:{} Loss:{:.5f} AvgLoss:{:.5f} GradNorm:{:.5f} "
-                "StepTime:{:.2f} LoaderTime:{:.2f} LR:{:.6f}".format(
-                    global_step, loss.item(), avg_loss, grad_norm, step_time,
-                    loader_time, current_lr),
+                "StepTime:{:.2f} LoaderTime:{:.2f} AvgLoaderTime:{:.2f} LR:{:.6f}".format(
+                    global_step, loss.item(), avg_loss, grad_norm, step_time,
+                    loader_time, avg_loader_time, current_lr),
                 flush=True)
         # save best model

View File

@@ -1,6 +1,6 @@
 {
-    "run_name": "Model compatible to CorentinJ/Real-Time-Voice-Cloning",
+    "run_name": "mueller91",
     "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ",
     "audio":{
         // Audio processing parameters
@@ -23,11 +23,11 @@
         "clip_norm": true, // clip normalized values into the range.
         "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
         "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
-        "do_trim_silence": false, // enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+        "do_trim_silence": true, // enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
         "trim_db": 60 // threshold for trimming silence. Set this according to your dataset.
     },
     "reinit_layers": [],
-    "loss": "ge2e", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
+    "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
     "grad_clip": 3.0, // upper limit for gradients for clipping.
     "epochs": 1000, // total number of epochs to train.
     "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
@@ -35,27 +35,69 @@
     "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
     "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
     "steps_plot_stats": 10, // number of steps to plot embeddings.
-    "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
-    "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
+    "num_utters_per_speaker": 10, // number of utterances sampled for each speaker in a batch.
+    "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
     "wd": 0.000001, // Weight decay weight.
     "checkpoint": true, // If true, it saves checkpoints per "save_step"
     "save_step": 1000, // Number of training steps expected to save training stats and checkpoints.
-    "print_step": 1, // Number of steps to log training on console.
-    "output_path": "../../checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
+    "print_step": 20, // Number of steps to log training on console.
+    "output_path": "../../MozillaTTSOutput/checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
     "model": {
         "input_dim": 40,
         "proj_dim": 256,
-        "lstm_dim": 256,
+        "lstm_dim": 768,
         "num_lstm_layers": 3,
-        "use_lstm_with_projection": false
+        "use_lstm_with_projection": true
+    },
+    "storage": {
+        "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage
+        "storage_size": 15, // the size of the in-memory storage with respect to a single batch
+        "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness
     },
     "datasets":
         [
             {
-                "name": "vctk",
-                "path": "../../../datasets/VCTK-Corpus-removed-silence/",
+                "name": "vctk_slim",
+                "path": "../../../audio-datasets/en/VCTK-Corpus/",
                 "meta_file_train": null,
                 "meta_file_val": null
+            },
+            {
+                "name": "libri_tts",
+                "path": "../../../audio-datasets/en/LibriTTS/train-clean-100",
+                "meta_file_train": null,
+                "meta_file_val": null
+            },
+            {
+                "name": "libri_tts",
+                "path": "../../../audio-datasets/en/LibriTTS/train-clean-360",
+                "meta_file_train": null,
+                "meta_file_val": null
+            },
+            {
+                "name": "libri_tts",
+                "path": "../../../audio-datasets/en/LibriTTS/train-other-500",
+                "meta_file_train": null,
+                "meta_file_val": null
+            },
+            {
+                "name": "voxceleb1",
+                "path": "../../../audio-datasets/en/voxceleb1/",
+                "meta_file_train": null,
+                "meta_file_val": null
+            },
+            {
+                "name": "voxceleb2",
+                "path": "../../../audio-datasets/en/voxceleb2/",
+                "meta_file_train": null,
+                "meta_file_val": null
+            },
+            {
+                "name": "common_voice",
+                "path": "../../../audio-datasets/en/MozillaCommonVoice",
+                "meta_file_train": "train.tsv",
+                "meta_file_val": "test.tsv"
             }
         ]
 }
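
A note on the new "storage" block: dataset.py (below) sizes its in-memory queue at storage_size * num_speakers_in_batch entries, so the values above cache 15 * 64 = 960 recently loaded (wavs, labels) items, and each draw reuses the cache with probability 0.66 once it is full. Quick arithmetic as a sketch, not library code:

    num_speakers_in_batch = 64    # "num_speakers_in_batch" above
    storage_size = 15             # "storage"["storage_size"], in batches
    sample_from_storage_p = 0.66

    queue_capacity = storage_size * num_speakers_in_batch
    print(queue_capacity)          # 960 cached items
    print(sample_from_storage_p)   # ~66% of draws skip the disk once the queue is full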

View File

@@ -1,11 +1,15 @@
+import numpy
 import numpy as np
+import queue
 import torch
 import random
 from torch.utils.data import Dataset
+from tqdm import tqdm


 class MyDataset(Dataset):
     def __init__(self, ap, meta_data, voice_len=1.6, num_speakers_in_batch=64,
+                 storage_size=1, sample_from_storage_p=0.5, additive_noise=0,
                  num_utter_per_speaker=10, skip_speakers=False, verbose=False):
         """
         Args:
@@ -24,8 +28,15 @@ class MyDataset(Dataset):
         self.ap = ap
         self.verbose = verbose
         self.__parse_items()
+        self.storage = queue.Queue(maxsize=storage_size*num_speakers_in_batch)
+        self.sample_from_storage_p = float(sample_from_storage_p)
+        self.additive_noise = float(additive_noise)
         if self.verbose:
             print("\n > DataLoader initialization")
+            print(f" | > Speakers per Batch: {num_speakers_in_batch}")
+            print(f" | > Storage Size: {self.storage.maxsize} speakers, each with {num_utter_per_speaker} utters")
+            print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}")
+            print(f" | > Noise added : {self.additive_noise}")
             print(f" | > Number of instances : {len(self.items)}")
             print(f" | > Sequence length: {self.seq_len}")
             print(f" | > Num speakers: {len(self.speakers)}")
@@ -51,21 +62,37 @@ class MyDataset(Dataset):
         return sample

     def __parse_items(self):
-        """
-        Find unique speaker ids and create a dict mapping utterances from speaker id
-        """
-        speakers = list({item[-1] for item in self.items})
         self.speaker_to_utters = {}
-        self.speakers = []
-        for speaker in speakers:
-            speaker_utters = [item[1] for item in self.items if item[2] == speaker]
-            if len(speaker_utters) < self.num_utter_per_speaker and self.skip_speakers:
-                print(
-                    f" [!] Skipped speaker {speaker}. Not enough utterances {self.num_utter_per_speaker} vs {len(speaker_utters)}."
-                )
-            else:
-                self.speakers.append(speaker)
-                self.speaker_to_utters[speaker] = speaker_utters
+        for i in self.items:
+            path_ = i[1]
+            speaker_ = i[2]
+            if speaker_ in self.speaker_to_utters.keys():
+                self.speaker_to_utters[speaker_].append(path_)
+            else:
+                self.speaker_to_utters[speaker_] = [path_, ]
+
+        if self.skip_speakers:
+            self.speaker_to_utters = {k: v for (k, v) in self.speaker_to_utters.items() if
+                                      len(v) >= self.num_utter_per_speaker}
+
+        self.speakers = [k for (k, v) in self.speaker_to_utters.items()]
+
+    # def __parse_items(self):
+    #     """
+    #     Find unique speaker ids and create a dict mapping utterances from speaker id
+    #     """
+    #     speakers = list({item[-1] for item in self.items})
+    #     self.speaker_to_utters = {}
+    #     self.speakers = []
+    #     for speaker in speakers:
+    #         speaker_utters = [item[1] for item in self.items if item[2] == speaker]
+    #         if len(speaker_utters) < self.num_utter_per_speaker and self.skip_speakers:
+    #             print(
+    #                 f" [!] Skipped speaker {speaker}. Not enough utterances {self.num_utter_per_speaker} vs {len(speaker_utters)}."
+    #             )
+    #         else:
+    #             self.speakers.append(speaker)
+    #             self.speaker_to_utters[speaker] = speaker_utters

     def __len__(self):
         return int(1e10)
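
The rewritten __parse_items builds the speaker-to-utterance map in one pass over the items instead of rescanning the full item list once per speaker, which matters with the roughly one million VoxCeleb2 utterances added in this PR. A toy run of the same logic, assuming the [text, wav_path, speaker] item layout used elsewhere in this diff:

    items = [
        [None, "a1.wav", "spk_a"],
        [None, "a2.wav", "spk_a"],
        [None, "b1.wav", "spk_b"],
    ]
    speaker_to_utters = {}
    for _, path_, speaker_ in items:
        speaker_to_utters.setdefault(speaker_, []).append(path_)

    # with skip_speakers=True and num_utter_per_speaker=2:
    speaker_to_utters = {k: v for k, v in speaker_to_utters.items() if len(v) >= 2}
    print(speaker_to_utters)  # {'spk_a': ['a1.wav', 'a2.wav']} -- spk_b dropped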
@@ -86,7 +113,7 @@ class MyDataset(Dataset):
         """
         Sample all M utterances for the given speaker.
         """
-        feats = []
+        wavs = []
         labels = []
         for _ in range(self.num_utter_per_speaker):
             # TODO:dummy but works
@@ -102,11 +129,9 @@ class MyDataset(Dataset):
                     break
                 self.speaker_to_utters[speaker].remove(utter)

-            offset = random.randint(0, wav.shape[0] - self.seq_len)
-            mel = self.ap.melspectrogram(wav[offset : offset + self.seq_len])
-            feats.append(torch.FloatTensor(mel))
+            wavs.append(wav)
             labels.append(speaker)
-        return feats, labels
+        return wavs, labels

     def __getitem__(self, idx):
         speaker, _ = self.__sample_speaker()
@@ -116,7 +141,28 @@ class MyDataset(Dataset):
         labels = []
         feats = []
         for speaker in batch:
-            feats_, labels_ = self.__sample_speaker_utterances(speaker)
+            if random.random() < self.sample_from_storage_p and self.storage.full():
+                # sample from storage (if full), ignoring the speaker
+                wavs_, labels_ = random.choice(self.storage.queue)
+            else:
+                # don't sample from storage, but from HDD
+                wavs_, labels_ = self.__sample_speaker_utterances(speaker)
+                # if storage is full, remove an item
+                if self.storage.full():
+                    _ = self.storage.get_nowait()
+                # put the newly loaded item into storage
+                self.storage.put_nowait((wavs_, labels_))
+
+            # add random gaussian noise
+            if self.additive_noise > 0:
+                noises_ = [numpy.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_]
+                wavs_ = [wavs_[i] + noises_[i] for i in range(len(wavs_))]
+
+            # get a random subset of each of the wavs and convert to mel spectrograms.
+            offsets_ = [random.randint(0, wav.shape[0] - self.seq_len) for wav in wavs_]
+            mels_ = [self.ap.melspectrogram(wavs_[i][offsets_[i]: offsets_[i] + self.seq_len]) for i in range(len(wavs_))]
+            feats_ = [torch.FloatTensor(mel) for mel in mels_]
             labels.append(labels_)
             feats.extend(feats_)
         feats = torch.stack(feats)
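
This collate path is the heart of the loader optimization: with probability sample_from_storage_p it reuses a cached (wavs, labels) tuple from the queue instead of reading from disk, and fresh loads rotate in FIFO-style. A stripped-down sketch with a stub standing in for __sample_speaker_utterances:

    import queue
    import random

    storage = queue.Queue(maxsize=4)
    sample_from_storage_p = 0.66

    def load_from_disk(speaker):
        # stub for MyDataset.__sample_speaker_utterances
        return [f"{speaker}.wav"], [speaker]

    random.seed(0)
    for speaker in ["a", "b", "c", "d", "e", "f"]:
        if random.random() < sample_from_storage_p and storage.full():
            wavs_, labels_ = random.choice(storage.queue)  # reuse a cached item
        else:
            wavs_, labels_ = load_from_disk(speaker)
            if storage.full():
                storage.get_nowait()           # evict the oldest entry
            storage.put_nowait((wavs_, labels_))
        print(labels_)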

View File

@@ -23,7 +23,7 @@ def save_checkpoint(model, optimizer, model_loss, out_path,

 def save_best_model(model, optimizer, model_loss, best_loss, out_path,
                     current_step):
-    if model_loss < best_loss:
+    if model_loss < best_loss and current_step > 1000:
         new_state_dict = model.state_dict()
         state = {
             'model': new_state_dict,
@@ -35,7 +35,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path,
         best_loss = model_loss
         bestmodel_path = 'best_model.pth.tar'
         bestmodel_path = os.path.join(out_path, bestmodel_path)
-        print("\n > BEST MODEL ({0:.5f}) : {1:}".format(
-            model_loss, bestmodel_path))
+        print("\n > NEW BEST MODEL ({0:.5f}) : {1:}".format(
+            model_loss, os.path.abspath(bestmodel_path)))
         torch.save(state, bestmodel_path)
         return best_loss

View File

@@ -2,6 +2,10 @@ import os
 from glob import glob
 import re
 import sys
+from pathlib import Path
+
+from tqdm import tqdm
+
 from TTS.tts.utils.generic_utils import split_dataset
@@ -14,8 +18,8 @@ def load_meta_data(datasets):
         meta_file_train = dataset['meta_file_train']
         meta_file_val = dataset['meta_file_val']
         preprocessor = get_preprocessor_by_name(name)
         meta_data_train = preprocessor(root_path, meta_file_train)
+        print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
         if meta_file_val is None:
             meta_data_eval, meta_data_train = split_dataset(meta_data_train)
         else:
@@ -167,8 +171,8 @@ def common_voice(root_path, meta_file):
         cols = line.split("\t")
         text = cols[2]
         speaker_name = cols[0]
-        wav_file = os.path.join(root_path, "clips", cols[1] + ".wav")
-        items.append([text, wav_file, speaker_name])
+        wav_file = os.path.join(root_path, "clips", cols[1].replace(".mp3", ".wav"))
+        items.append([text, wav_file, 'MCV_' + speaker_name])
     return items
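
The 'MCV_' prefix here (with 'LTTS_' and 'VCTK_' below) namespaces speaker ids so that raw ids reused across corpora stay distinct when datasets are concatenated for multi-dataset training. A two-line illustration:

    mcv_items = [["text", "clip1.wav", "MCV_123"]]
    ltts_items = [["text", "utt1.wav", "LTTS_123"]]
    print({item[-1] for item in mcv_items + ltts_items})
    # {'MCV_123', 'LTTS_123'} -- two speakers, not merged into one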
@@ -187,7 +191,7 @@ def libri_tts(root_path, meta_files=None):
             cols = line.split('\t')
             wav_file = os.path.join(_root_path, cols[0] + '.wav')
             text = cols[1]
-            items.append([text, wav_file, speaker_name])
+            items.append([text, wav_file, 'LTTS_' + speaker_name])
     for item in items:
         assert os.path.exists(
             item[1]), f" [!] wav files don't exist - {item[1]}"
@@ -235,8 +239,7 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'):
     """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz"""
     test_speakers = meta_files
     items = []
-    meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt",
-                      recursive=True)
+    meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True)
     for meta_file in meta_files:
         _, speaker_id, txt_file = os.path.relpath(meta_file,
                                                   root_path).split(os.sep)
@@ -249,6 +252,70 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'):
             text = file_text.readlines()[0]
         wav_file = os.path.join(root_path, wavs_path, speaker_id,
                                 file_id + '.wav')
-        items.append([text, wav_file, speaker_id])
+        items.append([text, wav_file, 'VCTK_' + speaker_id])
     return items
+
+
+def vctk_slim(root_path, meta_files=None, wavs_path='wav48'):
+    """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz"""
+    items = []
+    txt_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True)
+    for text_file in txt_files:
+        _, speaker_id, txt_file = os.path.relpath(text_file,
+                                                  root_path).split(os.sep)
+        file_id = txt_file.split('.')[0]
+        if isinstance(meta_files, list):  # if it is a list, ignore these speaker ids
+            if speaker_id in meta_files:
+                continue
+        wav_file = os.path.join(root_path, wavs_path, speaker_id,
+                                file_id + '.wav')
+        items.append([None, wav_file, 'VCTK_' + speaker_id])
+    return items
+
+
+# ======================================== VOX CELEB ===========================================
+def voxceleb2(root_path, meta_file=None):
+    """
+    :param meta_file  Used only for consistency with load_meta_data api
+    """
+    return _voxcel_x(root_path, meta_file, voxcel_idx="2")
+
+
+def voxceleb1(root_path, meta_file=None):
+    """
+    :param meta_file  Used only for consistency with load_meta_data api
+    """
+    return _voxcel_x(root_path, meta_file, voxcel_idx="1")
+
+
+def _voxcel_x(root_path, meta_file, voxcel_idx):
+    assert voxcel_idx in ["1", "2"]
+    expected_count = 148_000 if voxcel_idx == "1" else 1_000_000
+    voxceleb_path = Path(root_path)
+    cache_to = voxceleb_path / f"metafile_voxceleb{voxcel_idx}.csv"
+    cache_to.parent.mkdir(exist_ok=True)
+
+    # if no meta file exists, crawl recursively for 'wav' files
+    if meta_file is not None:
+        with open(str(meta_file), 'r') as f:
+            return [x.strip().split('|') for x in f.readlines()]
+    elif not cache_to.exists():
+        cnt = 0
+        meta_data = ""
+        wav_files = voxceleb_path.rglob("**/*.wav")
+        for path in tqdm(wav_files, desc=f"Building VoxCeleb {voxcel_idx} Meta file ... this needs to be done only once.",
+                         total=expected_count):
+            speaker_id = str(Path(path).parent.parent.stem)
+            assert speaker_id.startswith('id')
+            text = None  # VoxCeleb does not provide transcriptions, and they are not needed for training the SE
+            meta_data += f"{text}|{path}|voxcel{voxcel_idx}_{speaker_id}\n"
+            cnt += 1
+        with open(str(cache_to), 'w') as f:
+            f.write(meta_data)
+        if cnt < expected_count:
+            raise ValueError(f"Found too few instances for VoxCeleb. Should be around {expected_count}, is: {cnt}")
+
+    with open(str(cache_to), 'r') as f:
+        return [x.strip().split('|') for x in f.readlines()]
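
Hedged usage sketch for the new VoxCeleb loaders (the import path is assumed from this repo's layout and the dataset path is a placeholder): the first call crawls the tree for .wav files and writes metafile_voxceleb{1,2}.csv into the dataset root, so only the first run pays the crawling cost.

    from TTS.tts.datasets.preprocess import voxceleb1  # assumed module path

    items = voxceleb1("../../../audio-datasets/en/voxceleb1/")
    print(len(items))  # on the full corpus, around the expected_count of ~148k
    print(items[0])    # ['None', '<wav_path>', 'voxcel1_id...'] after the cache round-trip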

View File

@@ -7,11 +7,9 @@ from TTS.utils.generic_utils import check_argument

 def split_dataset(items):
-    is_multi_speaker = False
     speakers = [item[-1] for item in items]
     is_multi_speaker = len(set(speakers)) > 1
-    eval_split_size = 500 if len(items) * 0.01 > 500 else int(
-        len(items) * 0.01)
+    eval_split_size = min(500, int(len(items) * 0.01))
     assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples."
     np.random.seed(0)
     np.random.shuffle(items)
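
The simplified eval_split_size expression is equivalent to the old conditional: one percent of the items, capped at 500. Checked against a few dataset sizes:

    for n in (2_000, 20_000, 100_000):
        print(n, min(500, int(n * 0.01)))  # -> 20, 200, 500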
@@ -142,6 +140,11 @@ def check_config(c):
     check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool)
     check_argument('trim_db', c['audio'], restricted=True, val_type=int)

+    # storage parameters
+    check_argument('sample_from_storage_p', c['storage'], restricted=True, val_type=float, min_val=0.0, max_val=1.0)
+    check_argument('storage_size', c['storage'], restricted=True, val_type=int, min_val=1, max_val=100)
+    check_argument('additive_noise', c['storage'], restricted=True, val_type=float, min_val=0.0, max_val=1.0)
+
     # training parameters
     check_argument('batch_size', c, restricted=True, val_type=int, min_val=1)
     check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1)

View File

@@ -50,7 +50,7 @@ def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoch,
     if target_loss < best_loss:
         file_name = 'best_model.pth.tar'
         checkpoint_path = os.path.join(output_folder, file_name)
-        print(" > BEST MODEL : {}".format(checkpoint_path))
+        print(" >> BEST MODEL : {}".format(checkpoint_path))
         save_model(model, optimizer, current_step, epoch, r, checkpoint_path, model_loss=target_loss, **kwargs)
         best_loss = target_loss
     return best_loss

View File

@ -1,10 +1,6 @@
@@ -1,10 +1,6 @@
-client_id	path	sentence	up_votes	down_votes	age	gender	accent
-aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3	21fce545b24d9a5af0403b949e95e8dd3c10c4ff3e371f14e4d5b4ebf588670b7c9e618285fc872d94a89ed7f0217d9019fe5de33f1577b49dcd518eacf63c4b	Man sollte den Länderfinanzausgleich durch einen Bundesliga-Soli ersetzen.	2	0	fourties	male	germany
-aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3	42758baa4e91ef6b82b78b11a04bc5117a035a8d3bc42c33c0bb3084909af17043a194cfd8cd9839f0d6ef1ea5413acda5de5d1936abcc8ca073e2da7f9488ea	Folgende Lektüre kann ich Ihnen zum Thema Kognitionspsychologie empfehlen.	2	0	fourties	male	germany
-aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3	478f172c2dbda6675247e9674ade79a5b49efeefb7c9e99040dcc69a847a01d69398cf180570859b0cdb6fc887717e04cd8b149c723d48d00b5d18f41314667c	Touristen winkten den Leuten am Ufer zu.	2	0	fourties	male	germany
-aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3	4854368d6d21cb44103e432b5332f31e8d14030582a40850501bcf9377d699314a5ff27a8206fa89254ddde7f3f1c65d33836f3dfcfa16bcabec08537f2b5f08	Valentin hat das Handtuch geworfen.	2	0	fourties	male	germany
-aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3	a841a9f3e032495dd47560e65fba99eeacb3618c07de8b1351c20188e5b71e33cc52f73315f721a3a24b65763c65bb52fbf3ae052eb5774e834dcb57f296db5c	Ohne Gehörschutz bei der Arbeit wäre Klaus wohl nach zwei Wochen taub.	2	0	fourties	male	germany
-aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3	03ab970a5bf5410bc3260b073cce1c7f49c688ace83dc8836b1c0f79a09fea45a27725c769f4a9d2e6181defd016d22642789d7ac51da252b42958a9192bd4c7	Gerrit erinnerte sich daran, dass er einst einen Eid geschworen hatte.	2	0	fourties	male	germany
-aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3	c4a94df443ad5f2c7241413ef7145d5f0de41ae929759073917fe96166da3c7d3a612c920ed7b0f3d5950a38d6205e9dba24af5bfb27e390a220d004e6e26744	Auf das, was jetzt kommt, habe ich nämlich absolut keinen Bock.	2	0	fourties	male	germany
-aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3	104695983b1112229b4a48696405d044dad9ddef713aa6eb1a6240cc16b7b7a2a96354ae9da99783850dde08a982091e48d3037288a3a58269cac9fe70a6bd7a	Von Salzburg ist es doch nicht weit bis zum Chiemsee.	2	0	fourties	male	germany
-d5b5da343bb0f65e3580bc2e1902a4f5d004241488d751503f2020bc1c93f89715e355e35f6e25def2b90cb3eea99fda403eb92ae3afbb84d039a54a4ed2d875	ad2f69e053b0e20e01c82b9821fe5787f1cc8e4b0b97f0e4cab1e9a652c577169c8244fb222281a60ee3081854014113e04c4ca43643100b7c01dab0fac11974	Warum werden da keine strafrechtlichen Konsequenzen gezogen?	2	0	thirties	male	germany
+client_id	path	sentence	up_votes	down_votes	age	gender	accent	locale	segment
+95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b	common_voice_en_20005954.mp3	The applicants are invited for coffee and visa is given immediately.	3	0				en	
+95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b	common_voice_en_20005955.mp3	Developmental robotics is related to, but differs from, evolutionary robotics.	2	0				en	
+95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b	common_voice_en_20005956.mp3	The musical was originally directed and choreographed by Alan Lund.	2	0				en	
+954a4181ae9fba89d1b1570f2ae148b3ee18ee2311de978e698f598db859f830d93d35574596d713518e8c96cdae01fce7a08c60c2e0a22bcf01e020924440a6	common_voice_en_19737073.mp3	He graduated from Columbia High School, in Brown County, South Dakota.	2	0				en	
+954a4181ae9fba89d1b1570f2ae148b3ee18ee2311de978e698f598db859f830d93d35574596d713518e8c96cdae01fce7a08c60c2e0a22bcf01e020924440a6	common_voice_en_19737074.mp3	Competition for limited resources has also resulted in some local conflicts.	2	0				en	


View File

@@ -11,18 +11,8 @@ class TestPreprocessors(unittest.TestCase):
         root_path = get_tests_input_path()
         meta_file = "common_voice.tsv"
         items = common_voice(root_path, meta_file)
-        assert items[0][0] == "Man sollte den Länderfinanzausgleich durch " \
-                              "einen Bundesliga-Soli ersetzen."
-        assert items[0][1] == os.path.join(get_tests_input_path(), "clips",
-                                           "21fce545b24d9a5af0403b949e95e8dd3"
-                                           "c10c4ff3e371f14e4d5b4ebf588670b7c"
-                                           "9e618285fc872d94a89ed7f0217d9019f"
-                                           "e5de33f1577b49dcd518eacf63c4b.wav")
-        assert items[-1][0] == "Warum werden da keine strafrechtlichen " \
-                               "Konsequenzen gezogen?"
-        assert items[-1][1] == os.path.join(get_tests_input_path(), "clips",
-                                            "ad2f69e053b0e20e01c82b9821fe5787f1"
-                                            "cc8e4b0b97f0e4cab1e9a652c577169c82"
-                                            "44fb222281a60ee3081854014113e04c4c"
-                                            "a43643100b7c01dab0fac11974.wav")
+        assert items[0][0] == 'The applicants are invited for coffee and visa is given immediately.'
+        assert items[0][1] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_20005954.wav")
+        assert items[-1][0] == "Competition for limited resources has also resulted in some local conflicts."
+        assert items[-1][1] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_19737074.wav")