Fixed bug related to yourtts speaker embeddings issue (#2234)

* Fixed bug related to yourtts speaker embeddings issue

* Reverted code for base_tts

* Bug fix on VITS d_vector_file type

* Ignore the test speakers on YourTTS recipe

* Add speaker encoder model and config on YourTTS recipe to easily do zero-shot inference

* Update YourTTS config file

* Update ModelManager._update_path to deal with list attributes

* Fix lint checks

* Remove unused code

* Fix unit tests

* Reset name_to_id to get the right speaker ids on load_embeddings_from_list_of_files

* Set weighted_sampler_multipliers as an empty dict to prevent users' mistakes

Co-authored-by: Edresson Casanova <edresson1@gmail.com>
pull/2257/head
Khalid Bashir 2023-01-02 18:20:02 +05:00 committed by GitHub
parent da93d768b8
commit 42afad5e79
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 46 additions and 20 deletions

View File

@ -4,7 +4,7 @@
"multi-dataset":{
"your_tts":{
"description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
"default_vocoder": null,
"commit": "e9a1953e",
"license": "CC BY-NC-ND 4.0",

View File

@ -167,7 +167,7 @@ class VitsConfig(BaseTTSConfig):
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: str = None
d_vector_file: List[str] = None
d_vector_dim: int = None
def __post_init__(self):

View File

@ -477,8 +477,8 @@ class VitsArgs(Coqpit):
use_d_vector_file (bool):
Enable/Disable the use of d-vectors for multi-speaker training. Defaults to False.
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
d_vector_file (List[str]):
List of paths to the files including pre-computed speaker embeddings. Defaults to None.
d_vector_dim (int):
Number of d-vector channels. Defaults to 0.
@ -573,7 +573,7 @@ class VitsArgs(Coqpit):
use_speaker_embedding: bool = False
num_speakers: int = 0
speakers_file: str = None
d_vector_file: str = None
d_vector_file: List[str] = None
speaker_embedding_channels: int = 256
use_d_vector_file: bool = False
d_vector_dim: int = 0

View File

@ -235,6 +235,9 @@ class EmbeddingManager(BaseIDManager):
self.embeddings_by_names.update(embeddings_by_names)
self.embeddings.update(embeddings)
# reset name_to_id to get the right speaker ids
self.name_to_id = {name: i for i, name in enumerate(self.name_to_id)}
def get_embedding_by_clip(self, clip_idx: str) -> List:
"""Get embedding by clip ID.

View File

@ -109,10 +109,6 @@ class SpeakerManager(EmbeddingManager):
if get_from_config_or_model_args_with_default(config, "use_d_vector_file", False):
speaker_manager = SpeakerManager()
if get_from_config_or_model_args_with_default(config, "speakers_file", None):
speaker_manager = SpeakerManager(
d_vectors_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None)
)
if get_from_config_or_model_args_with_default(config, "d_vector_file", None):
speaker_manager = SpeakerManager(
d_vectors_file_path=get_from_config_or_model_args_with_default(config, "d_vector_file", None)

View File

@ -339,10 +339,18 @@ class ModelManager(object):
sub_conf = sub_conf[fd]
else:
return
sub_conf[field_names[-1]] = new_path
if isinstance(sub_conf[field_names[-1]], list):
sub_conf[field_names[-1]] = [new_path]
else:
sub_conf[field_names[-1]] = new_path
else:
# field name points to a top-level field
config[field_name] = new_path
if not field_name in config:
return
if isinstance(config[field_name], list):
config[field_name] = [new_path]
else:
config[field_name] = new_path
config.save_json(config_path)
@staticmethod

View File

@ -57,7 +57,25 @@ if not os.path.exists(VCTK_DOWNLOAD_PATH):
# init configs
vctk_config = BaseDatasetConfig(
formatter="vctk", dataset_name="vctk", meta_file_train="", meta_file_val="", path=VCTK_DOWNLOAD_PATH, language="en"
formatter="vctk",
dataset_name="vctk",
meta_file_train="",
meta_file_val="",
path=VCTK_DOWNLOAD_PATH,
language="en",
ignored_speakers=[
"p261",
"p225",
"p294",
"p347",
"p238",
"p234",
"p248",
"p335",
"p245",
"p326",
"p302",
], # Ignore the test speakers to fully replicate the paper experiment
)
# Add here all dataset configs; in our case we just want to train with the VCTK dataset, so we only need to add VCTK. Note: If you want to add new datasets, just add them here and the speaker embeddings (d-vectors) will be computed automatically for each new dataset :)
@ -111,11 +129,11 @@ model_args = VitsArgs(
use_d_vector_file=True,
d_vector_dim=512,
num_layers_text_encoder=10,
speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
resblock_type_decoder="2", # In the paper, we accidentally trained YourTTS using ResNet blocks of type 2; if you prefer, you can use ResNet blocks of type 1 like the VITS model
# Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
# use_speaker_encoder_as_loss=True,
# speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
# speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
# Useful parameters to enable multilingual training
# use_language_embedding=True,
# embedded_language_dim=4,
@ -207,6 +225,7 @@ config = VitsConfig(
use_weighted_sampler=True,
# Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
weighted_sampler_attrs={"speaker_name": 1.0},
weighted_sampler_multipliers={},
# It defines the Speaker Consistency Loss (SCL) α to 9 like the paper
speaker_encoder_loss_alpha=9.0,
)

View File

@ -210,7 +210,7 @@ class TestVits(unittest.TestCase):
num_chars=32,
use_d_vector_file=True,
d_vector_dim=256,
d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")],
)
config = VitsConfig(model_args=args)
model = Vits.init_from_config(config, verbose=False).to(device)
@ -355,7 +355,7 @@ class TestVits(unittest.TestCase):
num_chars=32,
use_d_vector_file=True,
d_vector_dim=256,
d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")],
)
config = VitsConfig(model_args=args)
model = Vits.init_from_config(config, verbose=False).to(device)
@ -587,7 +587,7 @@ class TestVits(unittest.TestCase):
num_chars=32,
use_d_vector_file=True,
d_vector_dim=256,
d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")],
)
)
model = Vits.init_from_config(config, verbose=False).to(device)

View File

@ -33,7 +33,7 @@ config.audio.trim_db = 60
# active multispeaker d-vec mode
config.model_args.use_d_vector_file = True
config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json"
config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"]
config.model_args.d_vector_dim = 256

View File

@ -63,8 +63,8 @@ config.use_speaker_embedding = False
# active multispeaker d-vec mode
config.model_args.use_d_vector_file = True
config.use_d_vector_file = True
config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json"
config.d_vector_file = "tests/data/ljspeech/speakers.json"
config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"]
config.d_vector_file = ["tests/data/ljspeech/speakers.json"]
config.model_args.d_vector_dim = 256
config.d_vector_dim = 256