Fix configs

pull/792/head
Eren Gölge 2021-07-12 12:29:02 +02:00
parent 0f19f8c911
commit 94e8e0d416
1 changed files with 30 additions and 1 deletions

View File

@ -12,60 +12,89 @@ class BaseAudioConfig(Coqpit):
Args:
fft_size (int):
Number of STFT frequency levels, a.k.a. the size of the linear spectrogram frame. Defaults to 1024.
win_length (int):
Each frame of audio is windowed by a window of length ```win_length``` and then padded with zeros to match
```fft_size```. Defaults to 1024.
hop_length (int):
Number of audio samples between adjacent STFT columns. Defaults to 1024.
frame_shift_ms (int):
Set ```hop_length``` based on milliseconds and sampling rate.
frame_length_ms (int):
Set ```win_length``` based on milliseconds and sampling rate.
stft_pad_mode (str):
Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
sample_rate (int):
Audio sampling rate. Defaults to 22050.
resample (bool):
Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.
preemphasis (float):
Preemphasis coefficient. Defaults to 0.0.
ref_level_db (int):
Reference dB level to rebase the audio signal and ignore the level below. 20 dB is assumed to be the sound of air.
Defaults to 20.
do_sound_norm (bool):
Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.
log_func (str):
Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.
do_trim_silence (bool):
Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.
do_amp_to_db_linear (bool, optional):
enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
do_amp_to_db_mel (bool, optional):
enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
trim_db (int):
Silence threshold used for silence trimming. Defaults to 45.
power (float):
Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce the
artifacts in the synthesized voice. Defaults to 1.5.
griffin_lim_iters (int):
Number of Griffin-Lim iterations. Defaults to 60.
num_mels (int):
Number of mel-basis frames that defines the frame lengths of each mel-spectrogram frame. Defaults to 80.
mel_fmin (float):
Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
It needs to be adjusted for a dataset. Defaults to 0.
mel_fmax (float):
Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
spec_gain (int):
Gain applied when converting amplitude to DB. Defaults to 20.
signal_norm (bool):
enable/disable signal normalization. Defaults to True.
min_level_db (int):
minimum db threshold for the computed melspectrograms. Defaults to -100.
symmetric_norm (bool):
enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else
[0, k]. Defaults to True.
max_norm (float):
```k``` defining the normalization range. Defaults to 4.0.
clip_norm (bool):
enable/disable clipping the out-of-range values in the normalized audio signal. Defaults to True.
stats_path (str):
Path to the computed stats file. Defaults to None.
"""
@ -298,7 +327,7 @@ class BaseTrainingConfig(Coqpit):
keep_all_best: bool = False
keep_after: int = 10000
# dataloading
num_loader_workers: int = None
num_loader_workers: int = 0
num_eval_loader_workers: int = 0
use_noise_augment: bool = False
# paths