Fixed bug related to yourtts speaker embeddings issue (#2234)

* Fixed bug related to yourtts speaker embeddings issue

* Reverted code for base_tts

* Bug fix on VITS d_vector_file type

* Ignore the test speakers on YourTTS recipe

* Add speaker encoder model and config on YourTTS recipe to easily do zero-shot inference

* Update YourTTS config file

* Update ModelManager._update_path to deal with list attributes

* Fix lint checks

* Remove unused code

* Fix unit tests

* Reset name_to_id to get the right speaker ids on load_embeddings_from_list_of_files

* Set weighted_sampler_multipliers as an empty dict to prevent users' mistakes

Co-authored-by: Edresson Casanova <edresson1@gmail.com>
pull/2257/head
Khalid Bashir 2023-01-02 18:20:02 +05:00 committed by GitHub
parent da93d768b8
commit 42afad5e79
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 46 additions and 20 deletions

View File

@ -4,7 +4,7 @@
"multi-dataset":{
"your_tts":{
"description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
"default_vocoder": null,
"commit": "e9a1953e",
"license": "CC BY-NC-ND 4.0",

View File

@ -167,7 +167,7 @@ class VitsConfig(BaseTTSConfig):
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: str = None
d_vector_file: List[str] = None
d_vector_dim: int = None
def __post_init__(self):

View File

@ -477,8 +477,8 @@ class VitsArgs(Coqpit):
use_d_vector_file (bool):
Enable/Disable the use of d-vectors for multi-speaker training. Defaults to False.
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
d_vector_file (List[str]):
List of paths to the files including pre-computed speaker embeddings. Defaults to None.
d_vector_dim (int):
Number of d-vector channels. Defaults to 0.
@ -573,7 +573,7 @@ class VitsArgs(Coqpit):
use_speaker_embedding: bool = False
num_speakers: int = 0
speakers_file: str = None
d_vector_file: str = None
d_vector_file: List[str] = None
speaker_embedding_channels: int = 256
use_d_vector_file: bool = False
d_vector_dim: int = 0

View File

@ -235,6 +235,9 @@ class EmbeddingManager(BaseIDManager):
self.embeddings_by_names.update(embeddings_by_names)
self.embeddings.update(embeddings)
# reset name_to_id to get the right speaker ids
self.name_to_id = {name: i for i, name in enumerate(self.name_to_id)}
def get_embedding_by_clip(self, clip_idx: str) -> List:
"""Get embedding by clip ID.

View File

@ -109,10 +109,6 @@ class SpeakerManager(EmbeddingManager):
if get_from_config_or_model_args_with_default(config, "use_d_vector_file", False):
speaker_manager = SpeakerManager()
if get_from_config_or_model_args_with_default(config, "speakers_file", None):
speaker_manager = SpeakerManager(
d_vectors_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None)
)
if get_from_config_or_model_args_with_default(config, "d_vector_file", None):
speaker_manager = SpeakerManager(
d_vectors_file_path=get_from_config_or_model_args_with_default(config, "d_vector_file", None)

View File

@ -339,10 +339,18 @@ class ModelManager(object):
sub_conf = sub_conf[fd]
else:
return
sub_conf[field_names[-1]] = new_path
if isinstance(sub_conf[field_names[-1]], list):
sub_conf[field_names[-1]] = [new_path]
else:
sub_conf[field_names[-1]] = new_path
else:
# field name points to a top-level field
config[field_name] = new_path
if not field_name in config:
return
if isinstance(config[field_name], list):
config[field_name] = [new_path]
else:
config[field_name] = new_path
config.save_json(config_path)
@staticmethod

View File

@ -57,7 +57,25 @@ if not os.path.exists(VCTK_DOWNLOAD_PATH):
# init configs
vctk_config = BaseDatasetConfig(
formatter="vctk", dataset_name="vctk", meta_file_train="", meta_file_val="", path=VCTK_DOWNLOAD_PATH, language="en"
formatter="vctk",
dataset_name="vctk",
meta_file_train="",
meta_file_val="",
path=VCTK_DOWNLOAD_PATH,
language="en",
ignored_speakers=[
"p261",
"p225",
"p294",
"p347",
"p238",
"p234",
"p248",
"p335",
"p245",
"p326",
"p302",
], # Ignore the test speakers to fully replicate the paper experiment
)
# Add here all dataset configs; in our case we just want to train with the VCTK dataset, so we only need to add VCTK. Note: If you want to add new datasets, just add them here and the speaker embeddings (d-vectors) will be computed automatically for each new dataset :)
@ -111,11 +129,11 @@ model_args = VitsArgs(
use_d_vector_file=True,
d_vector_dim=512,
num_layers_text_encoder=10,
speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
resblock_type_decoder="2", # In the paper, we accidentally trained YourTTS using ResNet blocks of type 2; if you prefer, you can use ResNet blocks of type 1 like the VITS model
# Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
# use_speaker_encoder_as_loss=True,
# speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
# speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
# Useful parameters to enable multilingual training
# use_language_embedding=True,
# embedded_language_dim=4,
@ -207,6 +225,7 @@ config = VitsConfig(
use_weighted_sampler=True,
# Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
weighted_sampler_attrs={"speaker_name": 1.0},
weighted_sampler_multipliers={},
# It defines the Speaker Consistency Loss (SCL) α to 9 like the paper
speaker_encoder_loss_alpha=9.0,
)

View File

@ -210,7 +210,7 @@ class TestVits(unittest.TestCase):
num_chars=32,
use_d_vector_file=True,
d_vector_dim=256,
d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")],
)
config = VitsConfig(model_args=args)
model = Vits.init_from_config(config, verbose=False).to(device)
@ -355,7 +355,7 @@ class TestVits(unittest.TestCase):
num_chars=32,
use_d_vector_file=True,
d_vector_dim=256,
d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")],
)
config = VitsConfig(model_args=args)
model = Vits.init_from_config(config, verbose=False).to(device)
@ -587,7 +587,7 @@ class TestVits(unittest.TestCase):
num_chars=32,
use_d_vector_file=True,
d_vector_dim=256,
d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")],
)
)
model = Vits.init_from_config(config, verbose=False).to(device)

View File

@ -33,7 +33,7 @@ config.audio.trim_db = 60
# active multispeaker d-vec mode
config.model_args.use_d_vector_file = True
config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json"
config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"]
config.model_args.d_vector_dim = 256

View File

@ -63,8 +63,8 @@ config.use_speaker_embedding = False
# active multispeaker d-vec mode
config.model_args.use_d_vector_file = True
config.use_d_vector_file = True
config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json"
config.d_vector_file = "tests/data/ljspeech/speakers.json"
config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"]
config.d_vector_file = ["tests/data/ljspeech/speakers.json"]
config.model_args.d_vector_dim = 256
config.d_vector_dim = 256