set silence trimming threshold in config

2020-02-03 14:16:40 +01:00 · 2020-02-03 14:16:40 +01:00 · ffe9a32813
parent ca33336ae0
commit ffe9a32813
2 changed files with 4 additions and 1 deletions
--- a/config.json
+++ b/config.json
@ -24,6 +24,7 @@
        "mel_fmin": 0.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
        "mel_fmax": 8000.0,        // maximum freq level for mel-spec. Tune for dataset!!
        "do_trim_silence": true  // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+        "trim_db": 60,          // threshold for trimming silence. Set this according to your dataset.
    },

    // DISTRIBUTED TRAINING
--- a/utils/audio.py
+++ b/utils/audio.py
@ -24,6 +24,7 @@ class AudioProcessor(object):
                 clip_norm=True,
                 griffin_lim_iters=None,
                 do_trim_silence=False,
+                 trim_db=60,
                 sound_norm=False,
                 **_):

@ -46,6 +47,7 @@ class AudioProcessor(object):
        self.max_norm = 1.0 if max_norm is None else float(max_norm)
        self.clip_norm = clip_norm
        self.do_trim_silence = do_trim_silence
+        self.trim_db = trim_db
        self.sound_norm = sound_norm
        self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
        assert min_level_db != 0.0, " [!] min_level_db is 0"
@ -217,7 +219,7 @@ class AudioProcessor(object):
        margin = int(self.sample_rate * 0.01)
        wav = wav[margin:-margin]
        return librosa.effects.trim(
-            wav, top_db=40, frame_length=self.win_length, hop_length=self.hop_length)[0]
+            wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[0]

    @staticmethod
    def mulaw_encode(wav, qc):