Compute F0 using librosa

pull/792/head
Eren Gölge 2021-07-06 09:48:00 +02:00
parent 165e5814af
commit fba257104d
3 changed files with 56 additions and 11 deletions

View File

@ -22,6 +22,7 @@ class TTSDataset(Dataset):
compute_linear_spec: bool,
ap: AudioProcessor,
meta_data: List[List],
compute_f0: bool = False,
characters: Dict = None,
custom_symbols: List = None,
add_blank: bool = False,
@ -54,6 +55,8 @@ class TTSDataset(Dataset):
meta_data (list): List of dataset instances.
compute_f0 (bool): compute f0 if True. Defaults to False.
characters (dict): `dict` of custom text characters used for converting texts to sequences.
custom_symbols (list): List of custom symbols used for converting texts to sequences. Models using its own
@ -103,6 +106,7 @@ class TTSDataset(Dataset):
self.cleaners = text_cleaner
self.compute_linear_spec = compute_linear_spec
self.return_wav = return_wav
self.compute_f0 = compute_f0
self.min_seq_len = min_seq_len
self.max_seq_len = max_seq_len
self.ap = ap
@ -458,6 +462,16 @@ class TTSDataset(Dataset):
wav_padded[i, :, : w.shape[0]] = torch.from_numpy(w)
wav_padded.transpose_(1, 2)
# compute f0
# TODO: compare perf in collate_fn vs in load_data
pitch = None
if self.compute_f0:
pitch = [self.ap.compute_f0(w).astype("float32") for w in wav]
pitch = prepare_tensor(pitch, self.outputs_per_step)
pitch = pitch.transpose(0, 2, 1)
assert mel.shape[1] == pitch.shape[1]
pitch = torch.FloatTensor(pitch).contiguous()
# collate attention alignments
if batch[0]["attn"] is not None:
attns = [batch[idx]["attn"].T for idx in ids_sorted_decreasing]

View File

@ -623,17 +623,41 @@ class AudioProcessor(object):
return 0, pad
return pad // 2, pad // 2 + pad % 2
### Compute F0 ###
# TODO: pyworld (pw) causes some dependency issues
# def compute_f0(self, x):
# f0, t = pw.dio(
# x.astype(np.double),
# fs=self.sample_rate,
# f0_ceil=self.mel_fmax,
# frame_period=1000 * self.hop_length / self.sample_rate,
# )
# f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
# return f0
def compute_f0(self, x: np.ndarray) -> np.ndarray:
    """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram.

    The pitch track is estimated with ``librosa.pyin``. The hop length is
    passed explicitly: when it is omitted, pyin falls back to
    ``frame_length // 4``, which only matches ``self.hop_length`` for
    configurations where ``win_length == 4 * hop_length``.

    Args:
        x (np.ndarray): Waveform.

    Returns:
        np.ndarray: Pitch, one value per analysis frame. Unvoiced frames are
            filled with ``0.0`` (``fill_na=0.0``).

    Examples:
        >>> WAV_FILE = librosa.util.example_audio_file()
        >>> from TTS.config import BaseAudioConfig
        >>> from TTS.utils.audio import AudioProcessor
        >>> conf = BaseAudioConfig(mel_fmax=8000)
        >>> ap = AudioProcessor(**conf)
        >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
        >>> pitch = ap.compute_f0(wav)
    """
    # NOTE: pyworld (dio + stonemask) was the earlier candidate for this job
    # but it caused dependency issues, hence librosa's pyin is used instead.

    # A lower bound of 0 Hz is replaced by 65 Hz (roughly C2, the minimum
    # suggested in the librosa pyin documentation).
    fmin = 65 if self.mel_fmin == 0 else self.mel_fmin

    f0, _, _ = librosa.pyin(
        x.astype(np.double),
        fmin=fmin,
        fmax=self.mel_fmax,
        frame_length=self.win_length,
        # keep the frame rate tied to the configured hop length instead of
        # pyin's implicit `frame_length // 4`
        hop_length=self.hop_length,
        sr=self.sample_rate,
        fill_na=0.0,
    )
    return f0
### Audio Processing ###
def find_endpoint(self, wav: np.ndarray, threshold_db=-40, min_silence_sec=0.8) -> int:

View File

@ -181,3 +181,10 @@ class TestAudio(unittest.TestCase):
mel_norm = ap.melspectrogram(wav)
mel_denorm = ap.denormalize(mel_norm)
assert abs(mel_reference - mel_denorm).max() < 1e-4
def test_compute_f0(self):
    """Check that the pitch length equals the second dimension of the melspectrogram."""
    processor = AudioProcessor(**conf)
    waveform = processor.load_wav(WAV_FILE)
    f0 = processor.compute_f0(waveform)
    mel_spec = processor.melspectrogram(waveform)
    n_pitch_frames = f0.shape[0]
    n_mel_frames = mel_spec.shape[1]
    assert n_pitch_frames == n_mel_frames