Replace pyworld by pyin (#1946)

* Replace pyworld by pyin * Fix unit tests
2022-09-09 05:43:14 -03:00 · 2022-09-09 05:43:14 -03:00 · 371772c355
parent 4546b4cbd8
commit 371772c355
6 changed files with 61 additions and 25 deletions
--- a/TTS/config/shared_configs.py
+++ b/TTS/config/shared_configs.py
@ -62,7 +62,7 @@ class BaseAudioConfig(Coqpit):
            Maximum frequency of the F0 frames. Defaults to ```640```.

        pitch_fmin (float, optional):
-            Minimum frequency of the F0 frames. Defaults to ```0```.
+            Minimum frequency of the F0 frames. Defaults to ```1```.

        trim_db (int):
            Silence threshold used for silence trimming. Defaults to 45.
@ -144,7 +144,7 @@ class BaseAudioConfig(Coqpit):
    do_amp_to_db_mel: bool = True
    # f0 params
    pitch_fmax: float = 640.0
-    pitch_fmin: float = 0.0
+    pitch_fmin: float = 1.0
    # normalization params
    signal_norm: bool = True
    min_level_db: int = -100
--- a/TTS/utils/audio/numpy_transforms.py
+++ b/TTS/utils/audio/numpy_transforms.py
@ -2,9 +2,9 @@ from typing import Tuple

 import librosa
 import numpy as np
-import pyworld as pw
 import scipy
 import soundfile as sf
+from librosa import pyin

 # For using kwargs
 # pylint: disable=unused-argument
@ -242,12 +242,28 @@ def compute_stft_paddings(


 def compute_f0(
-    *, x: np.ndarray = None, pitch_fmax: float = None, hop_length: int = None, sample_rate: int = None, **kwargs
+    *,
+    x: np.ndarray = None,
+    pitch_fmax: float = None,
+    pitch_fmin: float = None,
+    hop_length: int = None,
+    win_length: int = None,
+    sample_rate: int = None,
+    stft_pad_mode: str = "reflect",
+    center: bool = True,
+    **kwargs,
 ) -> np.ndarray:
    """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram.

    Args:
        x (np.ndarray): Waveform. Shape :math:`[T_wav,]`
+        pitch_fmax (float): Upper bound of the F0 search range, in Hz.
+        pitch_fmin (float): Lower bound of the F0 search range, in Hz.
+        hop_length (int): Number of audio samples between successive STFT columns (frames).
+        win_length (int): STFT window length.
+        sample_rate (int): Audio sampling rate.
+        stft_pad_mode (str): Padding mode for STFT.
+        center (bool): If True, pad the signal so that each frame is centered on its timestamp.

    Returns:
        np.ndarray: Pitch. Shape :math:`[T_pitch,]`. :math:`T_pitch == T_wav / hop_length`
@ -255,20 +271,35 @@ def compute_f0(
    Examples:
        >>> WAV_FILE = filename = librosa.util.example_audio_file()
        >>> from TTS.config import BaseAudioConfig
-        >>> from TTS.utils.audio.processor import AudioProcessor        >>> conf = BaseAudioConfig(pitch_fmax=8000)
+        >>> from TTS.utils.audio import AudioProcessor
+        >>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
        >>> ap = AudioProcessor(**conf)
-        >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
+        >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
        >>> pitch = ap.compute_f0(wav)
    """
    assert pitch_fmax is not None, " [!] Set `pitch_fmax` before caling `compute_f0`."
+    assert pitch_fmin is not None, " [!] Set `pitch_fmin` before caling `compute_f0`."

-    f0, t = pw.dio(
-        x.astype(np.double),
-        fs=sample_rate,
-        f0_ceil=pitch_fmax,
-        frame_period=1000 * hop_length / sample_rate,
+    f0, voiced_mask, _ = pyin(
+        y=x.astype(np.double),
+        fmin=pitch_fmin,
+        fmax=pitch_fmax,
+        sr=sample_rate,
+        frame_length=win_length,
+        win_length=win_length // 2,
+        hop_length=hop_length,
+        pad_mode=stft_pad_mode,
+        center=center,
+        n_thresholds=100,
+        beta_parameters=(2, 18),
+        boltzmann_parameter=2,
+        resolution=0.1,
+        max_transition_rate=35.92,
+        switch_prob=0.01,
+        no_trough_prob=0.01,
    )
-    f0 = pw.stonemask(x.astype(np.double), f0, t, sample_rate)
+    f0[~voiced_mask] = 0.0
+
    return f0


--- a/TTS/utils/audio/processor.py
+++ b/TTS/utils/audio/processor.py
@ -2,12 +2,12 @@ from typing import Dict, Tuple

 import librosa
 import numpy as np
-import pyworld as pw
 import scipy.io.wavfile
 import scipy.signal
 import soundfile as sf

 from TTS.tts.utils.helpers import StandardScaler
+from TTS.utils.audio.numpy_transforms import compute_f0

 # pylint: disable=too-many-public-methods

@ -573,23 +573,28 @@ class AudioProcessor(object):
            >>> WAV_FILE = filename = librosa.util.example_audio_file()
            >>> from TTS.config import BaseAudioConfig
            >>> from TTS.utils.audio import AudioProcessor
-            >>> conf = BaseAudioConfig(pitch_fmax=8000)
+            >>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
            >>> ap = AudioProcessor(**conf)
-            >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
+            >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
            >>> pitch = ap.compute_f0(wav)
        """
        assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before caling `compute_f0`."
+        assert self.pitch_fmin is not None, " [!] Set `pitch_fmin` before caling `compute_f0`."
        # align F0 length to the spectrogram length
        if len(x) % self.hop_length == 0:
-            x = np.pad(x, (0, self.hop_length // 2), mode="reflect")
+            x = np.pad(x, (0, self.hop_length // 2), mode=self.stft_pad_mode)

-        f0, t = pw.dio(
-            x.astype(np.double),
-            fs=self.sample_rate,
-            f0_ceil=self.pitch_fmax,
-            frame_period=1000 * self.hop_length / self.sample_rate,
+        f0 = compute_f0(
+            x=x,
+            pitch_fmax=self.pitch_fmax,
+            pitch_fmin=self.pitch_fmin,
+            hop_length=self.hop_length,
+            win_length=self.win_length,
+            sample_rate=self.sample_rate,
+            stft_pad_mode=self.stft_pad_mode,
+            center=True,
        )
-        f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
+
        return f0

    ### Audio Processing ###
--- a/requirements.txt
+++ b/requirements.txt
@ -23,7 +23,6 @@ umap-learn==0.5.1
 pandas
 # deps for training
 matplotlib
-pyworld==0.2.10 # > 0.2.10 is not p3.10.x compatible
 # coqui stack
 trainer
 # config management
--- a/tests/aux_tests/test_audio_processor.py
+++ b/tests/aux_tests/test_audio_processor.py
@ -10,7 +10,7 @@ OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")

 os.makedirs(OUT_PATH, exist_ok=True)
-conf = BaseAudioConfig(mel_fmax=8000)
+conf = BaseAudioConfig(mel_fmax=8000, pitch_fmax=640, pitch_fmin=1)


 # pylint: disable=protected-access
--- a/tests/aux_tests/test_numpy_transforms.py
+++ b/tests/aux_tests/test_numpy_transforms.py
@ -31,7 +31,8 @@ class TestNumpyTransforms(unittest.TestCase):
            mel_fmin: int = 0
            hop_length: int = 256
            win_length: int = 1024
-            pitch_fmax: int = 450
+            pitch_fmax: int = 640
+            pitch_fmin: int = 1
            trim_db: int = -1
            min_silence_sec: float = 0.01
            gain: float = 1.0