mirror of https://github.com/coqui-ai/TTS.git
Fixed bug related to yourtts speaker embeddings issue (#2234)
* Fixed bug related to yourtts speaker embeddings issue * Reverted code for base_tts * Bug fix on VITS d_vector_file type * Ignore the test speakers on YourTTS recipe * Add speaker encoder model and config on YourTTS recipe to easily do zero-shot inference * Update YourTTS config file * Update ModelManager._update_path to deal with list attributes * Fix lint checks * Remove unused code * Fix unit tests * Reset name_to_id to get the right speaker ids on load_embeddings_from_list_of_files * Set weighted_sampler_multipliers as an empty dict to prevent users' mistakes Co-authored-by: Edresson Casanova <edresson1@gmail.com>pull/2257/head
parent
da93d768b8
commit
42afad5e79
|
@ -4,7 +4,7 @@
|
|||
"multi-dataset":{
|
||||
"your_tts":{
|
||||
"description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": "e9a1953e",
|
||||
"license": "CC BY-NC-ND 4.0",
|
||||
|
|
|
@ -167,7 +167,7 @@ class VitsConfig(BaseTTSConfig):
|
|||
|
||||
# use d-vectors
|
||||
use_d_vector_file: bool = False
|
||||
d_vector_file: str = None
|
||||
d_vector_file: List[str] = None
|
||||
d_vector_dim: int = None
|
||||
|
||||
def __post_init__(self):
|
||||
|
|
|
@ -477,8 +477,8 @@ class VitsArgs(Coqpit):
|
|||
use_d_vector_file (bool):
|
||||
Enable/Disable the use of d-vectors for multi-speaker training. Defaults to False.
|
||||
|
||||
d_vector_file (str):
|
||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||
d_vector_file (List[str]):
|
||||
List of paths to the files including pre-computed speaker embeddings. Defaults to None.
|
||||
|
||||
d_vector_dim (int):
|
||||
Number of d-vector channels. Defaults to 0.
|
||||
|
@ -573,7 +573,7 @@ class VitsArgs(Coqpit):
|
|||
use_speaker_embedding: bool = False
|
||||
num_speakers: int = 0
|
||||
speakers_file: str = None
|
||||
d_vector_file: str = None
|
||||
d_vector_file: List[str] = None
|
||||
speaker_embedding_channels: int = 256
|
||||
use_d_vector_file: bool = False
|
||||
d_vector_dim: int = 0
|
||||
|
|
|
@ -235,6 +235,9 @@ class EmbeddingManager(BaseIDManager):
|
|||
self.embeddings_by_names.update(embeddings_by_names)
|
||||
self.embeddings.update(embeddings)
|
||||
|
||||
# reset name_to_id to get the right speaker ids
|
||||
self.name_to_id = {name: i for i, name in enumerate(self.name_to_id)}
|
||||
|
||||
def get_embedding_by_clip(self, clip_idx: str) -> List:
|
||||
"""Get embedding by clip ID.
|
||||
|
||||
|
|
|
@ -109,10 +109,6 @@ class SpeakerManager(EmbeddingManager):
|
|||
|
||||
if get_from_config_or_model_args_with_default(config, "use_d_vector_file", False):
|
||||
speaker_manager = SpeakerManager()
|
||||
if get_from_config_or_model_args_with_default(config, "speakers_file", None):
|
||||
speaker_manager = SpeakerManager(
|
||||
d_vectors_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None)
|
||||
)
|
||||
if get_from_config_or_model_args_with_default(config, "d_vector_file", None):
|
||||
speaker_manager = SpeakerManager(
|
||||
d_vectors_file_path=get_from_config_or_model_args_with_default(config, "d_vector_file", None)
|
||||
|
|
|
@ -339,10 +339,18 @@ class ModelManager(object):
|
|||
sub_conf = sub_conf[fd]
|
||||
else:
|
||||
return
|
||||
sub_conf[field_names[-1]] = new_path
|
||||
if isinstance(sub_conf[field_names[-1]], list):
|
||||
sub_conf[field_names[-1]] = [new_path]
|
||||
else:
|
||||
sub_conf[field_names[-1]] = new_path
|
||||
else:
|
||||
# field name points to a top-level field
|
||||
config[field_name] = new_path
|
||||
if not field_name in config:
|
||||
return
|
||||
if isinstance(config[field_name], list):
|
||||
config[field_name] = [new_path]
|
||||
else:
|
||||
config[field_name] = new_path
|
||||
config.save_json(config_path)
|
||||
|
||||
@staticmethod
|
||||
|
|
|
@ -57,7 +57,25 @@ if not os.path.exists(VCTK_DOWNLOAD_PATH):
|
|||
|
||||
# init configs
|
||||
vctk_config = BaseDatasetConfig(
|
||||
formatter="vctk", dataset_name="vctk", meta_file_train="", meta_file_val="", path=VCTK_DOWNLOAD_PATH, language="en"
|
||||
formatter="vctk",
|
||||
dataset_name="vctk",
|
||||
meta_file_train="",
|
||||
meta_file_val="",
|
||||
path=VCTK_DOWNLOAD_PATH,
|
||||
language="en",
|
||||
ignored_speakers=[
|
||||
"p261",
|
||||
"p225",
|
||||
"p294",
|
||||
"p347",
|
||||
"p238",
|
||||
"p234",
|
||||
"p248",
|
||||
"p335",
|
||||
"p245",
|
||||
"p326",
|
||||
"p302",
|
||||
], # Ignore the test speakers to fully replicate the paper experiment
|
||||
)
|
||||
|
||||
# Add all dataset configs here. In our case we only want to train on the VCTK dataset, so we add just VCTK. Note: if you want to add new datasets, just add them here and the speaker embeddings (d-vectors) will be computed automatically for each new dataset :)
|
||||
|
@ -111,11 +129,11 @@ model_args = VitsArgs(
|
|||
use_d_vector_file=True,
|
||||
d_vector_dim=512,
|
||||
num_layers_text_encoder=10,
|
||||
speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
|
||||
speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
|
||||
resblock_type_decoder="2", # On the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
|
||||
# Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
|
||||
# use_speaker_encoder_as_loss=True,
|
||||
# speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
|
||||
# speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
|
||||
# Useful parameters to enable multilingual training
|
||||
# use_language_embedding=True,
|
||||
# embedded_language_dim=4,
|
||||
|
@ -207,6 +225,7 @@ config = VitsConfig(
|
|||
use_weighted_sampler=True,
|
||||
# Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
|
||||
weighted_sampler_attrs={"speaker_name": 1.0},
|
||||
weighted_sampler_multipliers={},
|
||||
# It defines the Speaker Consistency Loss (SCL) α to 9 like the paper
|
||||
speaker_encoder_loss_alpha=9.0,
|
||||
)
|
||||
|
|
|
@ -210,7 +210,7 @@ class TestVits(unittest.TestCase):
|
|||
num_chars=32,
|
||||
use_d_vector_file=True,
|
||||
d_vector_dim=256,
|
||||
d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
|
||||
d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")],
|
||||
)
|
||||
config = VitsConfig(model_args=args)
|
||||
model = Vits.init_from_config(config, verbose=False).to(device)
|
||||
|
@ -355,7 +355,7 @@ class TestVits(unittest.TestCase):
|
|||
num_chars=32,
|
||||
use_d_vector_file=True,
|
||||
d_vector_dim=256,
|
||||
d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
|
||||
d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")],
|
||||
)
|
||||
config = VitsConfig(model_args=args)
|
||||
model = Vits.init_from_config(config, verbose=False).to(device)
|
||||
|
@ -587,7 +587,7 @@ class TestVits(unittest.TestCase):
|
|||
num_chars=32,
|
||||
use_d_vector_file=True,
|
||||
d_vector_dim=256,
|
||||
d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
|
||||
d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")],
|
||||
)
|
||||
)
|
||||
model = Vits.init_from_config(config, verbose=False).to(device)
|
||||
|
|
|
@ -33,7 +33,7 @@ config.audio.trim_db = 60
|
|||
|
||||
# active multispeaker d-vec mode
|
||||
config.model_args.use_d_vector_file = True
|
||||
config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json"
|
||||
config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"]
|
||||
config.model_args.d_vector_dim = 256
|
||||
|
||||
|
||||
|
|
|
@ -63,8 +63,8 @@ config.use_speaker_embedding = False
|
|||
# active multispeaker d-vec mode
|
||||
config.model_args.use_d_vector_file = True
|
||||
config.use_d_vector_file = True
|
||||
config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json"
|
||||
config.d_vector_file = "tests/data/ljspeech/speakers.json"
|
||||
config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"]
|
||||
config.d_vector_file = ["tests/data/ljspeech/speakers.json"]
|
||||
config.model_args.d_vector_dim = 256
|
||||
config.d_vector_dim = 256
|
||||
|
||||
|
|
Loading…
Reference in New Issue