mirror of https://github.com/coqui-ai/TTS.git
Compute F0 using librosa
parent
165e5814af
commit
fba257104d
|
@ -22,6 +22,7 @@ class TTSDataset(Dataset):
|
|||
compute_linear_spec: bool,
|
||||
ap: AudioProcessor,
|
||||
meta_data: List[List],
|
||||
compute_f0: bool = False,
|
||||
characters: Dict = None,
|
||||
custom_symbols: List = None,
|
||||
add_blank: bool = False,
|
||||
|
@ -54,6 +55,8 @@ class TTSDataset(Dataset):
|
|||
|
||||
meta_data (list): List of dataset instances.
|
||||
|
||||
compute_f0 (bool): compute f0 if True. Defaults to False.
|
||||
|
||||
characters (dict): `dict` of custom text characters used for converting texts to sequences.
|
||||
|
||||
custom_symbols (list): List of custom symbols used for converting texts to sequences. Models using its own
|
||||
|
@ -103,6 +106,7 @@ class TTSDataset(Dataset):
|
|||
self.cleaners = text_cleaner
|
||||
self.compute_linear_spec = compute_linear_spec
|
||||
self.return_wav = return_wav
|
||||
self.compute_f0 = compute_f0
|
||||
self.min_seq_len = min_seq_len
|
||||
self.max_seq_len = max_seq_len
|
||||
self.ap = ap
|
||||
|
@ -458,6 +462,16 @@ class TTSDataset(Dataset):
|
|||
wav_padded[i, :, : w.shape[0]] = torch.from_numpy(w)
|
||||
wav_padded.transpose_(1, 2)
|
||||
|
||||
# compute f0
|
||||
# TODO: compare perf in collate_fn vs in load_data
|
||||
pitch = None
|
||||
if self.compute_f0:
|
||||
pitch = [self.ap.compute_f0(w).astype("float32") for w in wav]
|
||||
pitch = prepare_tensor(pitch, self.outputs_per_step)
|
||||
pitch = pitch.transpose(0, 2, 1)
|
||||
assert mel.shape[1] == pitch.shape[1]
|
||||
pitch = torch.FloatTensor(pitch).contiguous()
|
||||
|
||||
# collate attention alignments
|
||||
if batch[0]["attn"] is not None:
|
||||
attns = [batch[idx]["attn"].T for idx in ids_sorted_decreasing]
|
||||
|
|
|
@ -623,17 +623,41 @@ class AudioProcessor(object):
|
|||
return 0, pad
|
||||
return pad // 2, pad // 2 + pad % 2
|
||||
|
||||
### Compute F0 ###
def compute_f0(self, x: np.ndarray) -> np.ndarray:
    """Compute pitch (f0) of a waveform with the same framing as the melspectrogram.

    Uses `librosa.pyin` with ``frame_length=self.win_length`` and
    ``hop_length=self.hop_length`` so the number of f0 frames matches the
    number of melspectrogram frames produced for the same waveform.

    Args:
        x (np.ndarray): Waveform.

    Returns:
        np.ndarray: Pitch contour, one value per frame. Unvoiced frames are
            filled with ``0.0`` (via ``fill_na``).

    Examples:
        >>> WAV_FILE = filename = librosa.util.example_audio_file()
        >>> from TTS.config import BaseAudioConfig
        >>> from TTS.utils.audio import AudioProcessor
        >>> conf = BaseAudioConfig(mel_fmax=8000)
        >>> ap = AudioProcessor(**conf)
        >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
        >>> pitch = ap.compute_f0(wav)
    """
    f0, _, _ = librosa.pyin(
        x.astype(np.double),
        # pyin rejects fmin=0; ~65 Hz (roughly C2) is a practical floor for
        # voiced speech when mel_fmin is unset.
        fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
        fmax=self.mel_fmax,
        frame_length=self.win_length,
        # BUG FIX: pyin defaults to hop_length = frame_length // 4, which only
        # matches the melspectrogram hop when win_length == 4 * hop_length.
        # Pass the processor's hop explicitly so f0 and mel frame counts stay
        # aligned (the dataset collate asserts mel.shape[1] == pitch.shape[1]).
        hop_length=self.hop_length,
        sr=self.sample_rate,
        fill_na=0.0,
    )
    return f0
|
||||
|
||||
### Audio Processing ###
|
||||
def find_endpoint(self, wav: np.ndarray, threshold_db=-40, min_silence_sec=0.8) -> int:
|
||||
|
|
|
@ -181,3 +181,10 @@ class TestAudio(unittest.TestCase):
|
|||
mel_norm = ap.melspectrogram(wav)
|
||||
mel_denorm = ap.denormalize(mel_norm)
|
||||
assert abs(mel_reference - mel_denorm).max() < 1e-4
|
||||
|
||||
def test_compute_f0(self):
    """f0 must yield exactly one pitch value per melspectrogram frame."""
    processor = AudioProcessor(**conf)
    waveform = processor.load_wav(WAV_FILE)
    f0 = processor.compute_f0(waveform)
    spec = processor.melspectrogram(waveform)
    # frame counts of the pitch contour and the spectrogram must agree
    assert f0.shape[0] == spec.shape[1]
|
||||
|
|
Loading…
Reference in New Issue