diff --git a/config.json b/config.json
index 71ba261e..89266a94 100644
--- a/config.json
+++ b/config.json
@@ -24,6 +24,7 @@
         "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
         "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
-        "do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+        "do_trim_silence": true, // enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+        "trim_db": 60 // threshold for trimming silence. Set this according to your dataset.
     },

     // DISTRIBUTED TRAINING
diff --git a/utils/audio.py b/utils/audio.py
index 82e5aa47..7b2c4834 100644
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -24,6 +24,7 @@ class AudioProcessor(object):
                  clip_norm=True,
                  griffin_lim_iters=None,
                  do_trim_silence=False,
+                 trim_db=60,
                  sound_norm=False,
                  **_):

@@ -46,6 +47,7 @@ class AudioProcessor(object):
         self.max_norm = 1.0 if max_norm is None else float(max_norm)
         self.clip_norm = clip_norm
         self.do_trim_silence = do_trim_silence
+        self.trim_db = trim_db
         self.sound_norm = sound_norm
         self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
         assert min_level_db != 0.0, " [!] min_level_db is 0"
@@ -217,7 +219,7 @@ class AudioProcessor(object):
         margin = int(self.sample_rate * 0.01)
         wav = wav[margin:-margin]
         return librosa.effects.trim(
-            wav, top_db=40, frame_length=self.win_length, hop_length=self.hop_length)[0]
+            wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[0]

     @staticmethod
     def mulaw_encode(wav, qc):
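For context, here is a minimal standalone sketch of the code path this patch touches: `trim_db` from `config.json` is forwarded to `librosa.effects.trim` as `top_db`, replacing the previously hard-coded `top_db=40`. The sample rate, STFT parameters, and `sample.wav` path below are placeholder assumptions, not values from the repo; only `trim_db=60` comes from this change.

```python
import librosa

# Placeholder values standing in for config.json / _stft_parameters();
# only trim_db=60 is the new default introduced by this patch.
sample_rate = 22050
win_length = 1024
hop_length = 256
trim_db = 60  # smaller values treat more audio as silence and trim harder

wav, _ = librosa.load("sample.wav", sr=sample_rate)  # hypothetical input file

# Mirror AudioProcessor.trim_silence(): drop a 10 ms margin from each end,
# then strip leading/trailing frames quieter than (peak - trim_db) dB.
margin = int(sample_rate * 0.01)
wav = wav[margin:-margin]
trimmed, _ = librosa.effects.trim(
    wav, top_db=trim_db, frame_length=win_length, hop_length=hop_length)

print(f"trimmed {len(wav) - len(trimmed)} samples of silence")
```

Note the direction of the knob: raising the default from 40 to 60 dB makes trimming more conservative, since only audio more than `trim_db` below the peak level counts as silence.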