mirror of https://github.com/coqui-ai/TTS.git
Fix configs
parent
0f19f8c911
commit
94e8e0d416
|
@ -12,60 +12,89 @@ class BaseAudioConfig(Coqpit):
|
|||
Args:
|
||||
fft_size (int):
|
||||
Number of STFT frequency levels, a.k.a. the size of the linear spectrogram frame. Defaults to 1024.
|
||||
|
||||
win_length (int):
|
||||
Each frame of audio is windowed by a window of length ```win_length``` and then padded with zeros to match
|
||||
```fft_size```. Defaults to 1024.
|
||||
|
||||
hop_length (int):
|
||||
Number of audio samples between adjacent STFT columns. Defaults to 1024.
|
||||
|
||||
frame_shift_ms (int):
|
||||
Set ```hop_length``` based on milliseconds and sampling rate.
|
||||
|
||||
frame_length_ms (int):
|
||||
Set ```win_length``` based on milliseconds and sampling rate.
|
||||
|
||||
stft_pad_mode (str):
|
||||
Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
|
||||
|
||||
sample_rate (int):
|
||||
Audio sampling rate. Defaults to 22050.
|
||||
|
||||
resample (bool):
|
||||
Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.
|
||||
|
||||
preemphasis (float):
|
||||
Preemphasis coefficient. Defaults to 0.0.
|
||||
|
||||
ref_level_db (int):
|
||||
Reference dB level to rebase the audio signal and ignore the level below. 20 dB is assumed to be the sound of air.
|
||||
Defaults to 20.
|
||||
|
||||
do_sound_norm (bool):
|
||||
Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.
|
||||
|
||||
log_func (str):
|
||||
Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.
|
||||
|
||||
do_trim_silence (bool):
|
||||
Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.
|
||||
|
||||
do_amp_to_db_linear (bool, optional):
|
||||
enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
|
||||
|
||||
do_amp_to_db_mel (bool, optional):
|
||||
enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
|
||||
|
||||
trim_db (int):
|
||||
Silence threshold used for silence trimming. Defaults to 45.
|
||||
|
||||
power (float):
|
||||
Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce the
|
||||
artifacts in the synthesized voice. Defaults to 1.5.
|
||||
|
||||
griffin_lim_iters (int):
|
||||
Number of Griffin-Lim iterations. Defaults to 60.
|
||||
|
||||
num_mels (int):
|
||||
Number of mel-basis filters, which defines the number of frequency bins in each mel-spectrogram frame. Defaults to 80.
|
||||
|
||||
mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
|
||||
It needs to be adjusted for a dataset. Defaults to 0.
|
||||
|
||||
mel_fmax (float):
|
||||
Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
|
||||
|
||||
spec_gain (int):
|
||||
Gain applied when converting amplitude to DB. Defaults to 20.
|
||||
|
||||
signal_norm (bool):
|
||||
enable/disable signal normalization. Defaults to True.
|
||||
|
||||
min_level_db (int):
|
||||
minimum db threshold for the computed melspectrograms. Defaults to -100.
|
||||
|
||||
symmetric_norm (bool):
|
||||
enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else
|
||||
[0, k]. Defaults to True.
|
||||
|
||||
max_norm (float):
|
||||
```k``` defining the normalization range. Defaults to 4.0.
|
||||
|
||||
clip_norm (bool):
|
||||
enable/disable clipping the out-of-range values in the normalized audio signal. Defaults to True.
|
||||
|
||||
stats_path (str):
|
||||
Path to the computed stats file. Defaults to None.
|
||||
"""
|
||||
|
@ -298,7 +327,7 @@ class BaseTrainingConfig(Coqpit):
|
|||
keep_all_best: bool = False
|
||||
keep_after: int = 10000
|
||||
# dataloading
|
||||
num_loader_workers: int = None
|
||||
num_loader_workers: int = 0
|
||||
num_eval_loader_workers: int = 0
|
||||
use_noise_augment: bool = False
|
||||
# paths
|
||||
|
|
Loading…
Reference in New Issue