diff --git a/config_tacotron_gst.json b/config_tacotron_gst.json index 98fafa54..3c872730 100644 --- a/config_tacotron_gst.json +++ b/config_tacotron_gst.json @@ -77,6 +77,7 @@ "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages "text_cleaner": "phoneme_cleaners", - "use_speaker_embedding": false // whether to use additional embeddings for separate speakers + "use_speaker_embedding": false, // whether to use additional embeddings for separate speakers + "style_wav_for_test": null // path to wav for styling the inference tests when using GST } \ No newline at end of file diff --git a/train.py b/train.py index 815a0a32..c893cb36 100644 --- a/train.py +++ b/train.py @@ -409,11 +409,13 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch): test_figures = {} print(" | > Synthesizing test sentences") speaker_id = 0 if c.use_speaker_embedding else None + style_wav = c.get("style_wav_for_test") for idx, test_sentence in enumerate(test_sentences): try: wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis( model, test_sentence, c, use_cuda, ap, - speaker_id=speaker_id) + speaker_id=speaker_id, + style_wav=style_wav) file_path = os.path.join(AUDIO_PATH, str(current_step)) os.makedirs(file_path, exist_ok=True) file_path = os.path.join(file_path,