Compute F0 using librosa

2021-07-06 09:48:00 +02:00 · 2021-07-06 09:48:00 +02:00 · fba257104d
parent 165e5814af
commit fba257104d
3 changed files with 56 additions and 11 deletions
--- a/TTS/tts/datasets/TTSDataset.py
+++ b/TTS/tts/datasets/TTSDataset.py
@@ -22,6 +22,7 @@ class TTSDataset(Dataset):
        compute_linear_spec: bool,
        ap: AudioProcessor,
        meta_data: List[List],
+        compute_f0: bool = False,
        characters: Dict = None,
        custom_symbols: List = None,
        add_blank: bool = False,
@@ -54,6 +55,8 @@ class TTSDataset(Dataset):

            meta_data (list): List of dataset instances.

+            compute_f0 (bool): compute f0 if True. Defaults to False.
+
            characters (dict): `dict` of custom text characters used for converting texts to sequences.

            custom_symbols (list): List of custom symbols used for converting texts to sequences. Models using its own
@@ -103,6 +106,7 @@ class TTSDataset(Dataset):
        self.cleaners = text_cleaner
        self.compute_linear_spec = compute_linear_spec
        self.return_wav = return_wav
+        self.compute_f0 = compute_f0
        self.min_seq_len = min_seq_len
        self.max_seq_len = max_seq_len
        self.ap = ap
@@ -458,6 +462,16 @@ class TTSDataset(Dataset):
                    wav_padded[i, :, : w.shape[0]] = torch.from_numpy(w)
                wav_padded.transpose_(1, 2)

+            # compute f0
+            # TODO: compare perf in collate_fn vs in load_data
+            pitch = None
+            if self.compute_f0:
+                pitch = [self.ap.compute_f0(w).astype("float32") for w in wav]
+                pitch = prepare_tensor(pitch, self.outputs_per_step)
+                pitch = pitch.transpose(0, 2, 1)
+                assert mel.shape[1] == pitch.shape[1]
+                pitch = torch.FloatTensor(pitch).contiguous()
+
            # collate attention alignments
            if batch[0]["attn"] is not None:
                attns = [batch[idx]["attn"].T for idx in ids_sorted_decreasing]
--- a/TTS/utils/audio.py
+++ b/TTS/utils/audio.py
@@ -623,17 +623,41 @@ class AudioProcessor(object):
            return 0, pad
        return pad // 2, pad // 2 + pad % 2

-    ### Compute F0 ###
-    # TODO: pw causes some dep issues
-    # def compute_f0(self, x):
-    #     f0, t = pw.dio(
-    #         x.astype(np.double),
-    #         fs=self.sample_rate,
-    #         f0_ceil=self.mel_fmax,
-    #         frame_period=1000 * self.hop_length / self.sample_rate,
-    #     )
-    #     f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
-    #     return f0
+    def compute_f0(self, x: np.ndarray) -> np.ndarray:
+        """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram.
+
+        Args:
+            x (np.ndarray): Waveform.
+
+        Returns:
+            np.ndarray: Pitch.
+
+        Examples:
+            >>> WAV_FILE = filename = librosa.util.example_audio_file()
+            >>> from TTS.config import BaseAudioConfig
+            >>> from TTS.utils.audio import AudioProcessor
+            >>> conf = BaseAudioConfig(mel_fmax=8000)
+            >>> ap = AudioProcessor(**conf)
+            >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
+            >>> pitch = ap.compute_f0(wav)
+        """
+        # f0, t = pw.dio(
+        #     x.astype(np.double),
+        #     fs=self.sample_rate,
+        #     f0_ceil=self.mel_fmax,
+        #     frame_period=1000 * self.hop_length / self.sample_rate,
+        # )
+        # f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
+        # f0 = compute_yin(, self.sample_rate, self.hop_length, self.fft_size)
+        f0, _, _ = librosa.pyin(
+            x.astype(np.double),
+            fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
+            fmax=self.mel_fmax,
+            frame_length=self.win_length,
+            sr=self.sample_rate,
+            fill_na=0.0,
+        )
+        return f0

    ### Audio Processing ###
    def find_endpoint(self, wav: np.ndarray, threshold_db=-40, min_silence_sec=0.8) -> int:
--- a/tests/test_audio_processor.py
+++ b/tests/test_audio_processor.py
@@ -181,3 +181,10 @@ class TestAudio(unittest.TestCase):
        mel_norm = ap.melspectrogram(wav)
        mel_denorm = ap.denormalize(mel_norm)
        assert abs(mel_reference - mel_denorm).max() < 1e-4
+
+    def test_compute_f0(self):
+        ap = AudioProcessor(**conf)
+        wav = ap.load_wav(WAV_FILE)
+        pitch = ap.compute_f0(wav)
+        mel = ap.melspectrogram(wav)
+        assert pitch.shape[0] == mel.shape[1]