From 94e8e0d416ae16e5e77535f9fe13780d5a344d78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 12 Jul 2021 12:29:02 +0200 Subject: [PATCH] Fix configs --- TTS/config/shared_configs.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index af054346..0de3795c 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -12,60 +12,89 @@ class BaseAudioConfig(Coqpit): Args: fft_size (int): Number of STFT frequency levels aka.size of the linear spectogram frame. Defaults to 1024. + win_length (int): Each frame of audio is windowed by a window of length ```win_length``` and then padded with zeros to match ```fft_size```. Defaults to 1024. + hop_length (int): Number of audio samples between adjacent STFT columns. Defaults to 1024. + frame_shift_ms (int): Set ```hop_length``` based on milliseconds and sampling rate. + frame_length_ms (int): Set ```win_length``` based on milliseconds and sampling rate. + stft_pad_mode (str): Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'. + sample_rate (int): Audio sampling rate. Defaults to 22050. + resample (bool): Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```. + preemphasis (float): Preemphasis coefficient. Defaults to 0.0. + ref_level_db (int): Reference dB level to rebase the audio signal and ignore the level below. 20 dB is assumed to be the sound of air. Defaults to 20. + do_sound_norm (bool): Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False. + + log_func (str): + Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'. + do_trim_silence (bool): Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```. 
+ do_amp_to_db_linear (bool, optional): enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True. + do_amp_to_db_mel (bool, optional): enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True. + trim_db (int): Silence threshold used for silence trimming. Defaults to 45. + power (float): Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce the artifacts in the synthesized voice. Defaults to 1.5. + griffin_lim_iters (int): Number of Griffin-Lim iterations. Defaults to 60. + num_mels (int): Number of mel-basis filters, which defines the number of frequency bins in each mel-spectrogram frame. Defaults to 80. + mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices. It needs to be adjusted for a dataset. Defaults to 0. + mel_fmax (float): Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset. + spec_gain (int): Gain applied when converting amplitude to dB. Defaults to 20. + signal_norm (bool): enable/disable signal normalization. Defaults to True. + min_level_db (int): minimum dB threshold for the computed melspectrograms. Defaults to -100. + symmetric_norm (bool): enable/disable symmetric normalization. If set to True, normalization is performed in the range [-k, k], else [0, k]. Defaults to True. + max_norm (float): ```k``` defining the normalization range. Defaults to 4.0. + clip_norm (bool): enable/disable clipping the out-of-range values in the normalized audio signal. Defaults to True. + stats_path (str): Path to the computed stats file. Defaults to None. """ @@ -298,7 +327,7 @@ class BaseTrainingConfig(Coqpit): keep_all_best: bool = False keep_after: int = 10000 # dataloading - num_loader_workers: int = None + num_loader_workers: int = 0 num_eval_loader_workers: int = 0 use_noise_augment: bool = False # paths