trim silence if enabled

2018-11-23 16:58:26 +01:00 · 2018-11-23 16:58:26 +01:00 · 0f0bde935c
parent 22dcc4f7d0
commit 0f0bde935c
2 changed files with 13 additions and 1 deletions
--- a/config.json
+++ b/config.json
@ -21,7 +21,8 @@
        "max_norm": 1,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,      // clip normalized values into the range.
        "mel_fmin": null,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-        "mel_fmax": null        // maximum freq level for mel-spec. Tune for dataset!!
+        "mel_fmax": null,        // maximum freq level for mel-spec. Tune for dataset!!
+        "do_trim_silence": true  // enable trimming of silence from audio as you load it.
    },

    "embedding_size": 256,    
--- a/utils/audio.py
+++ b/utils/audio.py
@ -26,6 +26,7 @@ class AudioProcessor(object):
                 mel_fmax=None,
                 clip_norm=True,
                 griffin_lim_iters=None,
+                 do_trim_silence=False
                 **kwargs):

        print(" > Setting up Audio Processor...")
@ -47,6 +48,7 @@ class AudioProcessor(object):
        self.mel_fmax = mel_fmax
        self.max_norm = 1.0 if max_norm is None else float(max_norm)
        self.clip_norm = clip_norm
+        self.do_trim_silence = do_trim_silence
        self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
        print(" | > Audio Processor attributes.")
        members = vars(self)
@ -203,6 +205,13 @@ class AudioProcessor(object):
                return x + hop_length
        return len(wav)

+    def trim_silence(self, wav):
+        """Trim leading and trailing silence from a waveform.
+
+        A fixed 0.1 second margin is first cut from both ends of the
+        signal, then ``librosa.effects.trim`` strips what it still
+        considers silence (``top_db=40``) from the remaining ends.
+
+        Args:
+            wav: audio samples at ``self.sample_rate``; presumably the 1-D
+                array returned by ``librosa.load`` -- confirm with callers.
+
+        Returns:
+            The trimmed signal (first element of ``librosa.effects.trim``).
+        """
+        margin = int(self.sample_rate * 0.1)
+        # Cut the margins only when samples remain afterwards. A zero margin
+        # must be skipped as well, because ``wav[0:-0]`` is an empty slice.
+        if 0 < margin and 2 * margin < len(wav):
+            wav = wav[margin:-margin]
+        return librosa.effects.trim(
+            wav, top_db=40, frame_length=1024, hop_length=256)[0]
+
    # WaveRNN repo specific functions
    # def mulaw_encode(self, wav, qc):
    #     mu = qc - 1
@ -225,6 +234,8 @@ class AudioProcessor(object):

    def load_wav(self, filename, encode=False):
+        """Load an audio file at ``self.sample_rate``.
+
+        Args:
+            filename: path of the audio file handed to ``librosa.load``.
+            encode: not used by this method.
+
+        Returns:
+            The loaded signal, passed through ``trim_silence`` first when
+            ``self.do_trim_silence`` is set.
+        """
        x, sr = librosa.load(filename, sr=self.sample_rate)
+        if self.do_trim_silence:
+            # trim_silence is defined on this class, so call it on self.
+            x = self.trim_silence(x)
        # sr, x = io.wavfile.read(filename)
        assert self.sample_rate == sr
        return x