From 0536aa6d0f41b125dd96811a8a5b04fac70d6652 Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Sat, 22 May 2021 17:12:19 +0900
Subject: [PATCH 01/10] Japanese Tacotron 2 model

---
 TTS/tts/configs/kokoro_tacotron2.json | 173 ++++++++++++
 TTS/tts/datasets/preprocess.py        |  14 +
 TTS/tts/utils/japanese/__init__.py    |   1 +
 TTS/tts/utils/japanese/text.py        | 380 ++++++++++++++++++++++++++
 TTS/tts/utils/japanese/text_test.py   |  22 ++
 TTS/tts/utils/text/__init__.py        |   5 +
 requirements.txt                      |   2 +
 7 files changed, 597 insertions(+)
 create mode 100644 TTS/tts/configs/kokoro_tacotron2.json
 create mode 100644 TTS/tts/utils/japanese/__init__.py
 create mode 100644 TTS/tts/utils/japanese/text.py
 create mode 100644 TTS/tts/utils/japanese/text_test.py

diff --git a/TTS/tts/configs/kokoro_tacotron2.json b/TTS/tts/configs/kokoro_tacotron2.json
new file mode 100644
index 00000000..f5d41194
--- /dev/null
+++ b/TTS/tts/configs/kokoro_tacotron2.json
@@ -0,0 +1,173 @@
+{
+    "model": "Tacotron2",
+    "run_name": "kokoro-ddc",
+    "run_description": "tacotron2 with DDC and differential spectral loss.",
+
+    // AUDIO PARAMETERS
+    "audio":{
+        // stft parameters
+        "fft_size": 1024,  // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024,  // stft window length in samples.
+        "hop_length": 256,  // stft window hop-length in samples.
+        "frame_length_ms": null,  // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.
+
+        // Audio processing parameters
+        "sample_rate": 22050,  // DATASET-RELATED: wav sample-rate.
+        "preemphasis": 0.0,  // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+        "ref_level_db": 20,  // reference level db, theoretically 20db is the sound of air.
+
+        // Silence trimming
+        "do_trim_silence": true,  // enable trimming of silence of audio as you load it. LJSpeech (true), TWEB (false), Nancy (true)
+        "trim_db": 60,  // threshold for trimming silence. Set this according to your dataset.
+
+        // Griffin-Lim
+        "power": 1.5,  // value to sharpen wav signals after GL algorithm.
+        "griffin_lim_iters": 60,  // #griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
+
+        // MelSpectrogram parameters
+        "num_mels": 80,  // size of the mel spec frame.
+        "mel_fmin": 50.0,  // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 7600.0,  // maximum freq level for mel-spec. Tune for dataset!!
+        "spec_gain": 1,
+
+        // Normalization parameters
+        "signal_norm": true,  // normalize spec values. Mean-Var normalization if 'stats_path' is defined, otherwise range normalization defined by the other params.
+        "min_level_db": -100,  // lower bound for normalization
+        "symmetric_norm": true,  // move normalization to range [-1, 1]
+        "max_norm": 4.0,  // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true,  // clip normalized values into the range.
+        "stats_path": "./scale_stats.npy"  // DO NOT USE WITH MULTI_SPEAKER MODEL. Scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored.
+    },
+
+    // VOCABULARY PARAMETERS
+    // if custom character set is not defined,
+    // default set in symbols.py is used
+    "characters":{
+        "pad": "_",
+        "eos": "~",
+        "bos": "^",
+        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+        "punctuations": "!'(),-.:;? ",
", + "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + }, + + // DISTRIBUTED TRAINING + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":16, + "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. + + // LOSS SETTINGS + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled + "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled + "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled + "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled + "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. + + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "noam_schedule": false, // use noam warmup and lr schedule. + "grad_clip": 1.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "wd": 0.000001, // Weight decay weight. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. + + // TACOTRON PRENET + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. + "prenet_type": "original", // "original" or "bn". + "prenet_dropout": true, // enable/disable dropout at prenet. + + // TACOTRON ATTENTION + "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution' + "attention_heads": 4, // number of attention heads (only for 'graves') + "attention_norm": "sigmoid", // softmax or sigmoid. + "windowing": false, // Enables attention windowing. Used only in eval mode. 
+ "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. + "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. + "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log training on console. + "tb_plot_step": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evalulation. + "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "basic_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "batch_group_size": 4, //Number of batches to shuffle after bucketing. + "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + "compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage. + "use_noise_augment": true, + + // PATHS + "output_path": "./Models/Kokoro/", + + // PHONEMES + "phoneme_cache_path": "./phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_language": "ja-jp", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. 
+ "use_gst": false, // use global style tokens + "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) <= len(gst_style_tokens). + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10, + "gst_use_speaker_embedding": false + }, + + // DATASETS + "datasets": // List of datasets. They all merged and they get different speaker_ids. + [ + { + "name": "kokoro", + "path": "./kokoro-speech-v1_1-small/", + "meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers + "meta_file_val": null + } + ] +} diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 72ab160e..271b1734 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -424,3 +424,17 @@ def baker(root_path: str, meta_file: str) -> List[List[str]]: wav_path = os.path.join(root_path, "clips_22", wav_name) items.append([text, wav_path, speaker_name]) return items + + +def kokoro(root_path, meta_file): + """Japanese single-speaker dataset from https://github.com/kaiidams/Kokoro-Speech-Dataset""" + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "kokoro" + with open(txt_file, "r") as ttf: + for line in ttf: + cols = line.split("|") + wav_file = os.path.join(root_path, "wavs", cols[0] + '.wav') + text = cols[2].replace(" ", "") + items.append([text, wav_file, speaker_name]) + return items diff --git a/TTS/tts/utils/japanese/__init__.py b/TTS/tts/utils/japanese/__init__.py new file mode 100644 index 00000000..0ce7a99d --- /dev/null +++ b/TTS/tts/utils/japanese/__init__.py @@ -0,0 +1 @@ +from .text import japanese_text2phone \ No newline at end of file diff --git a/TTS/tts/utils/japanese/text.py b/TTS/tts/utils/japanese/text.py new file mode 100644 index 00000000..4c8936ac --- /dev/null +++ b/TTS/tts/utils/japanese/text.py @@ -0,0 +1,380 @@ +# Convert Japanese text to phonemes which is +# compatible with Julius https://github.com/julius-speech/segmentation-kit + +import re +import MeCab +from typing import List, Tuple + +_CONVRULES = [ + # Conversion of 2 letters + 'アァ/ a a', + 'イィ/ i i', + 'イェ/ i e', + 'イャ/ y a', + 'ウゥ/ u:', + 'エェ/ e e', + 'オォ/ o:', + 'カァ/ k a:', + 'キィ/ k i:', + 'クゥ/ k u:', + 'クャ/ ky a', + 'クュ/ ky u', + 'クョ/ ky o', + 'ケェ/ k e:', + 'コォ/ k o:', + 'ガァ/ g a:', + 'ギィ/ g i:', + 'グゥ/ g u:', + 'グャ/ gy a', + 'グュ/ gy u', + 'グョ/ gy o', + 'ゲェ/ g e:', + 'ゴォ/ g o:', + 'サァ/ s a:', + 'シィ/ sh i:', + 'スゥ/ s u:', + 'スャ/ sh a', + 'スュ/ sh u', + 'スョ/ sh o', + 'セェ/ s e:', + 'ソォ/ s o:', + 'ザァ/ z a:', + 'ジィ/ j i:', + 'ズゥ/ z u:', + 'ズャ/ zy a', + 'ズュ/ zy u', + 'ズョ/ zy o', + 'ゼェ/ z e:', + 'ゾォ/ z o:', + 'タァ/ t a:', + 'チィ/ ch i:', + 'ツァ/ ts a', + 'ツィ/ ts i', + 'ツゥ/ 
diff --git a/TTS/tts/utils/japanese/__init__.py b/TTS/tts/utils/japanese/__init__.py
new file mode 100644
index 00000000..0ce7a99d
--- /dev/null
+++ b/TTS/tts/utils/japanese/__init__.py
@@ -0,0 +1 @@
+from .text import japanese_text2phone
\ No newline at end of file
diff --git a/TTS/tts/utils/japanese/text.py b/TTS/tts/utils/japanese/text.py
new file mode 100644
index 00000000..4c8936ac
--- /dev/null
+++ b/TTS/tts/utils/japanese/text.py
@@ -0,0 +1,380 @@
+# Convert Japanese text to phonemes that are compatible with
+# Julius https://github.com/julius-speech/segmentation-kit
+
+import re
+import MeCab
+from typing import List, Tuple
+
+_CONVRULES = [
+    # Conversion of 2 letters
+    'アァ/ a a',
+    'イィ/ i i',
+    'イェ/ i e',
+    'イャ/ y a',
+    'ウゥ/ u:',
+    'エェ/ e e',
+    'オォ/ o:',
+    'カァ/ k a:',
+    'キィ/ k i:',
+    'クゥ/ k u:',
+    'クャ/ ky a',
+    'クュ/ ky u',
+    'クョ/ ky o',
+    'ケェ/ k e:',
+    'コォ/ k o:',
+    'ガァ/ g a:',
+    'ギィ/ g i:',
+    'グゥ/ g u:',
+    'グャ/ gy a',
+    'グュ/ gy u',
+    'グョ/ gy o',
+    'ゲェ/ g e:',
+    'ゴォ/ g o:',
+    'サァ/ s a:',
+    'シィ/ sh i:',
+    'スゥ/ s u:',
+    'スャ/ sh a',
+    'スュ/ sh u',
+    'スョ/ sh o',
+    'セェ/ s e:',
+    'ソォ/ s o:',
+    'ザァ/ z a:',
+    'ジィ/ j i:',
+    'ズゥ/ z u:',
+    'ズャ/ zy a',
+    'ズュ/ zy u',
+    'ズョ/ zy o',
+    'ゼェ/ z e:',
+    'ゾォ/ z o:',
+    'タァ/ t a:',
+    'チィ/ ch i:',
+    'ツァ/ ts a',
+    'ツィ/ ts i',
+    'ツゥ/ ts u:',
+    'ツャ/ ch a',
+    'ツュ/ ch u',
+    'ツョ/ ch o',
+    'ツェ/ ts e',
+    'ツォ/ ts o',
+    'テェ/ t e:',
+    'トォ/ t o:',
+    'ダァ/ d a:',
+    'ヂィ/ j i:',
+    'ヅゥ/ d u:',
+    'ヅャ/ zy a',
+    'ヅュ/ zy u',
+    'ヅョ/ zy o',
+    'デェ/ d e:',
+    'ドォ/ d o:',
+    'ナァ/ n a:',
+    'ニィ/ n i:',
+    'ヌゥ/ n u:',
+    'ヌャ/ ny a',
+    'ヌュ/ ny u',
+    'ヌョ/ ny o',
+    'ネェ/ n e:',
+    'ノォ/ n o:',
+    'ハァ/ h a:',
+    'ヒィ/ h i:',
+    'フゥ/ f u:',
+    'フャ/ hy a',
+    'フュ/ hy u',
+    'フョ/ hy o',
+    'ヘェ/ h e:',
+    'ホォ/ h o:',
+    'バァ/ b a:',
+    'ビィ/ b i:',
+    'ブゥ/ b u:',
+    'ブャ/ by a',
+    'ブュ/ by u',
+    'ブョ/ by o',
+    'ベェ/ b e:',
+    'ボォ/ b o:',
+    'パァ/ p a:',
+    'ピィ/ p i:',
+    'プゥ/ p u:',
+    'プャ/ py a',
+    'プュ/ py u',
+    'プョ/ py o',
+    'ペェ/ p e:',
+    'ポォ/ p o:',
+    'マァ/ m a:',
+    'ミィ/ m i:',
+    'ムゥ/ m u:',
+    'ムャ/ my a',
+    'ムュ/ my u',
+    'ムョ/ my o',
+    'メェ/ m e:',
+    'モォ/ m o:',
+    'ヤァ/ y a:',
+    'ユゥ/ y u:',
+    'ユャ/ y a:',
+    'ユュ/ y u:',
+    'ユョ/ y o:',
+    'ヨォ/ y o:',
+    'ラァ/ r a:',
+    'リィ/ r i:',
+    'ルゥ/ r u:',
+    'ルャ/ ry a',
+    'ルュ/ ry u',
+    'ルョ/ ry o',
+    'レェ/ r e:',
+    'ロォ/ r o:',
+    'ワァ/ w a:',
+    'ヲォ/ o:',
+    'ディ/ d i',
+    'デェ/ d e:',
+    'デャ/ dy a',
+    'デュ/ dy u',
+    'デョ/ dy o',
+    'ティ/ t i',
+    'テェ/ t e:',
+    'テャ/ ty a',
+    'テュ/ ty u',
+    'テョ/ ty o',
+    'スィ/ s i',
+    'ズァ/ z u a',
+    'ズィ/ z i',
+    'ズゥ/ z u',
+    'ズャ/ zy a',
+    'ズュ/ zy u',
+    'ズョ/ zy o',
+    'ズェ/ z e',
+    'ズォ/ z o',
+    'キャ/ ky a',
+    'キュ/ ky u',
+    'キョ/ ky o',
+    'シャ/ sh a',
+    'シュ/ sh u',
+    'シェ/ sh e',
+    'ショ/ sh o',
+    'チャ/ ch a',
+    'チュ/ ch u',
+    'チェ/ ch e',
+    'チョ/ ch o',
+    'トゥ/ t u',
+    'トャ/ ty a',
+    'トュ/ ty u',
+    'トョ/ ty o',
+    'ドァ/ d o a',
+    'ドゥ/ d u',
+    'ドャ/ dy a',
+    'ドュ/ dy u',
+    'ドョ/ dy o',
+    'ドォ/ d o:',
+    'ニャ/ ny a',
+    'ニュ/ ny u',
+    'ニョ/ ny o',
+    'ヒャ/ hy a',
+    'ヒュ/ hy u',
+    'ヒョ/ hy o',
+    'ミャ/ my a',
+    'ミュ/ my u',
+    'ミョ/ my o',
+    'リャ/ ry a',
+    'リュ/ ry u',
+    'リョ/ ry o',
+    'ギャ/ gy a',
+    'ギュ/ gy u',
+    'ギョ/ gy o',
+    'ヂェ/ j e',
+    'ヂャ/ j a',
+    'ヂュ/ j u',
+    'ヂョ/ j o',
+    'ジェ/ j e',
+    'ジャ/ j a',
+    'ジュ/ j u',
+    'ジョ/ j o',
+    'ビャ/ by a',
+    'ビュ/ by u',
+    'ビョ/ by o',
+    'ピャ/ py a',
+    'ピュ/ py u',
+    'ピョ/ py o',
+    'ウァ/ u a',
+    'ウィ/ w i',
+    'ウェ/ w e',
+    'ウォ/ w o',
+    'ファ/ f a',
+    'フィ/ f i',
+    'フゥ/ f u',
+    'フャ/ hy a',
+    'フュ/ hy u',
+    'フョ/ hy o',
+    'フェ/ f e',
+    'フォ/ f o',
+    'ヴァ/ b a',
+    'ヴィ/ b i',
+    'ヴェ/ b e',
+    'ヴォ/ b o',
+    'ヴュ/ by u',
+
+    # Conversion of 1 letter
+    'ア/ a',
+    'イ/ i',
+    'ウ/ u',
+    'エ/ e',
+    'オ/ o',
+    'カ/ k a',
+    'キ/ k i',
+    'ク/ k u',
+    'ケ/ k e',
+    'コ/ k o',
+    'サ/ s a',
+    'シ/ sh i',
+    'ス/ s u',
+    'セ/ s e',
+    'ソ/ s o',
+    'タ/ t a',
+    'チ/ ch i',
+    'ツ/ ts u',
+    'テ/ t e',
+    'ト/ t o',
+    'ナ/ n a',
+    'ニ/ n i',
+    'ヌ/ n u',
+    'ネ/ n e',
+    'ノ/ n o',
+    'ハ/ h a',
+    'ヒ/ h i',
+    'フ/ f u',
+    'ヘ/ h e',
+    'ホ/ h o',
+    'マ/ m a',
+    'ミ/ m i',
+    'ム/ m u',
+    'メ/ m e',
+    'モ/ m o',
+    'ラ/ r a',
+    'リ/ r i',
+    'ル/ r u',
+    'レ/ r e',
+    'ロ/ r o',
+    'ガ/ g a',
+    'ギ/ g i',
+    'グ/ g u',
+    'ゲ/ g e',
+    'ゴ/ g o',
+    'ザ/ z a',
+    'ジ/ j i',
+    'ズ/ z u',
+    'ゼ/ z e',
+    'ゾ/ z o',
+    'ダ/ d a',
+    'ヂ/ j i',
+    'ヅ/ z u',
+    'デ/ d e',
+    'ド/ d o',
+    'バ/ b a',
+    'ビ/ b i',
+    'ブ/ b u',
+    'ベ/ b e',
+    'ボ/ b o',
+    'パ/ p a',
+    'ピ/ p i',
+    'プ/ p u',
+    'ペ/ p e',
+    'ポ/ p o',
+    'ヤ/ y a',
+    'ユ/ y u',
+    'ヨ/ y o',
+    'ワ/ w a',
+    'ヰ/ i',
+    'ヱ/ e',
+    'ヲ/ o',
+    'ン/ N',
+    'ッ/ q',
+    'ヴ/ b u',
+    'ー/:',
+
+    # Try converting broken text
+    'ァ/ a',
+    'ィ/ i',
+    'ゥ/ u',
+    'ェ/ e',
+    'ォ/ o',
+    'ヮ/ w a',
+    'ォ/ o',
+
+    # Symbols
+    '、/ ,',
+    '。/ .',
+    '!/ !',
+    '?/ ?',
+    '・/ ,'
+]
+
+_COLON_RX = re.compile(':+')
+_REJECT_RX = re.compile('[^ a-zA-Z:,.?]')
+
+def _makerulemap():
+    l = [tuple(x.split('/')) for x in _CONVRULES]
+    return tuple(
+        {k: v for k, v in l if len(k) == i}
+        for i in (1, 2)
+    )
+
+_RULEMAP1, _RULEMAP2 = _makerulemap()
+
+def kata2phoneme(text: str) -> str:
+    """Convert katakana text to phonemes.
+    """
+    text = text.strip()
+    res = ''
+    while text:
+        if len(text) >= 2:
+            x = _RULEMAP2.get(text[:2])
+            if x is not None:
+                text = text[2:]
+                res += x
+                continue
+        x = _RULEMAP1.get(text[0])
+        if x is not None:
+            text = text[1:]
+            res += x
+            continue
+        res += ' ' + text[0]
+        text = text[1:]
+    res = _COLON_RX.sub(':', res)
+    return res[1:]
+
+_KATAKANA = ''.join(chr(ch) for ch in range(ord('ァ'), ord('ン') + 1))
+_HIRAGANA = ''.join(chr(ch) for ch in range(ord('ぁ'), ord('ん') + 1))
+_HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)
+
+def hira2kata(text: str) -> str:
+    text = text.translate(_HIRA2KATATRANS)
+    return text.replace('う゛', 'ヴ')
+
+_SYMBOL_TOKENS = set(list('・、。?!'))
+_NO_YOMI_TOKENS = set(list('「」『』―()[][] …'))
+_TAGGER = MeCab.Tagger()
+
+def text2kata(text: str) -> str:
+    parsed = _TAGGER.parse(text)
+    res = []
+    for line in parsed.split('\n'):
+        if line == 'EOS':
+            break
+        parts = line.split('\t')
+
+        word, yomi = parts[0], parts[1]
+        if yomi:
+            res.append(yomi)
+        else:
+            if word in _SYMBOL_TOKENS:
+                res.append(word)
+            elif word == 'っ' or word == 'ッ':
+                res.append('ッ')
+            elif word in _NO_YOMI_TOKENS:
+                pass
+            else:
+                res.append(word)
+    return hira2kata(''.join(res))
+
+def japanese_text2phone(text: str) -> str:
+    """Convert Japanese text to phonemes.
+    """
+    res = text2kata(text)
+    res = kata2phoneme(res)
+    return res.replace(' ', '')
\ No newline at end of file
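The module works in two stages: text2kata() normalizes the input to katakana using MeCab readings, and kata2phoneme() consumes that katakana with a longest-match-first scan, trying the two-character map _RULEMAP2 before the one-character map _RULEMAP1. A small sketch of both behaviors, importing from the module as it exists at this point in the series (it is moved and renamed in a later patch); the end-to-end expectation is taken from the test file added below:

from TTS.tts.utils.japanese.text import kata2phoneme, japanese_text2phone

# Two-character rules win: 'キャ' hits the 'キャ/ ky a' rule directly instead
# of decomposing into 'キ' (k i) plus the small 'ャ'.
print(kata2phoneme('キャ'))  # 'ky a'
print(kata2phoneme('キア'))  # 'k i a'

# End to end: kanji/kana -> katakana via MeCab -> phonemes, spaces removed.
assert japanese_text2phone('どちらに行きますか?') == 'dochiraniikimasuka?'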
diff --git a/TTS/tts/utils/japanese/text_test.py b/TTS/tts/utils/japanese/text_test.py
new file mode 100644
index 00000000..7a04925a
--- /dev/null
+++ b/TTS/tts/utils/japanese/text_test.py
@@ -0,0 +1,22 @@
+import unittest
+from . import japanese_text2phone
+
+_TEST_CASES = '''
+どちらに行きますか?/dochiraniikimasuka?
+今日は温泉に、行きます。/kyo:waoNseNni,ikimasu.
+「A」から「Z」までです。/AkaraZmadedesu.
+そうですね!/so:desune!
+クジラは哺乳類です。/kujirawahonyu:ruidesu.
+ヴィディオを見ます。/bidioomimasu.
+ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu.
+'''
+
+class TestText(unittest.TestCase):
+
+    def test_text2phone(self):
+        for line in _TEST_CASES.strip().split('\n'):
+            text, phone = line.split('/')
+            self.assertEqual(japanese_text2phone(text), phone)
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index 9367e6e2..9b63e7f1 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -39,6 +39,11 @@ def text2phone(text, language):
     if language == "zh-CN":
         ph = chinese_text_to_phonemes(text)
         return ph
+    elif language == "ja-jp":
+        from TTS.tts.utils.japanese import japanese_text2phone
+        ph = japanese_text2phone(text)
+        return ph
+
     raise ValueError(f" [!] Language {language} is not supported for phonemization.")
diff --git a/requirements.txt b/requirements.txt
index c6ce7672..7f45f9e0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,5 @@ numba==0.52
 umap-learn==0.4.6
 unidecode==0.4.20
 coqpit
+mecab-python3
+unidic-lite
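The two new requirements close the loop for text2kata(): mecab-python3 provides MeCab.Tagger, and unidic-lite ships a dictionary so the tagger works without a system-wide MeCab install. The parser above assumes each output line is tab-separated with the reading in the second field, terminated by an 'EOS' line. A quick way to eyeball that assumption; the exact feature columns depend on the installed dictionary, so treat the printed shape as something to verify locally:

import MeCab  # from mecab-python3; unidic-lite supplies the dictionary

tagger = MeCab.Tagger()
for line in tagger.parse('行きます').split('\n'):
    if line == 'EOS':
        break
    parts = line.split('\t')
    # text2kata() uses parts[1] as the katakana reading (yomi)
    print(parts[0], parts[1] if len(parts) > 1 else '(no yomi)')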
From f921a05bdb6ce6fc950c290b8a3aec613a7f70fe Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Wed, 26 May 2021 19:02:16 +0900
Subject: [PATCH 02/10] Fixed lint errors

---
 TTS/tts/utils/japanese/__init__.py  | 2 +-
 TTS/tts/utils/japanese/text.py      | 5 ++---
 TTS/tts/utils/japanese/text_test.py | 2 +-
 TTS/tts/utils/text/__init__.py      | 3 ++-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/TTS/tts/utils/japanese/__init__.py b/TTS/tts/utils/japanese/__init__.py
index 0ce7a99d..30d963e8 100644
--- a/TTS/tts/utils/japanese/__init__.py
+++ b/TTS/tts/utils/japanese/__init__.py
@@ -1 +1 @@
-from .text import japanese_text2phone
\ No newline at end of file
+from .text import japanese_text2phone
diff --git a/TTS/tts/utils/japanese/text.py b/TTS/tts/utils/japanese/text.py
index 4c8936ac..3a705352 100644
--- a/TTS/tts/utils/japanese/text.py
+++ b/TTS/tts/utils/japanese/text.py
@@ -3,7 +3,6 @@
 
 import re
 import MeCab
-from typing import List, Tuple
 
 _CONVRULES = [
     # Conversion of 2 letters
@@ -364,7 +363,7 @@ def text2kata(text: str) -> str:
         else:
             if word in _SYMBOL_TOKENS:
                 res.append(word)
-            elif word == 'っ' or word == 'ッ':
+            elif word in ('っ', 'ッ'):
                 res.append('ッ')
             elif word in _NO_YOMI_TOKENS:
                 pass
@@ -377,4 +376,4 @@ def japanese_text2phone(text: str) -> str:
     """
     res = text2kata(text)
     res = kata2phoneme(res)
-    return res.replace(' ', '')
\ No newline at end of file
+    return res.replace(' ', '')
diff --git a/TTS/tts/utils/japanese/text_test.py b/TTS/tts/utils/japanese/text_test.py
index 7a04925a..d3ade826 100644
--- a/TTS/tts/utils/japanese/text_test.py
+++ b/TTS/tts/utils/japanese/text_test.py
@@ -19,4 +19,4 @@ class TestText(unittest.TestCase):
         self.assertEqual(japanese_text2phone(text), phone)
 
 if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index 9b63e7f1..d7423102 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -39,7 +39,8 @@ def text2phone(text, language):
     if language == "zh-CN":
         ph = chinese_text_to_phonemes(text)
         return ph
-    elif language == "ja-jp":
+
+    if language == "ja-jp":
         from TTS.tts.utils.japanese import japanese_text2phone
         ph = japanese_text2phone(text)
         return ph

From c4987e9d4e503628df5661c0945b2047ea046b1f Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Fri, 28 May 2021 00:22:57 +0900
Subject: [PATCH 03/10] Move import to the head of the file.

---
 TTS/tts/utils/text/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index d7423102..f6b46783 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -6,6 +6,7 @@ from packaging import version
 
 from TTS.tts.utils.text import cleaners
 from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
+from TTS.tts.utils.japanese import japanese_text2phone
 from TTS.tts.utils.text.symbols import _bos, _eos, _punctuations, make_symbols, phonemes, symbols
 
 # pylint: disable=unnecessary-comprehension
@@ -41,7 +42,6 @@ def text2phone(text, language):
         return ph
 
     if language == "ja-jp":
-        from TTS.tts.utils.japanese import japanese_text2phone
         ph = japanese_text2phone(text)
         return ph

From d0c9c1ca5c28d37845ea7a19d399851c5bfd5429 Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Sat, 29 May 2021 09:21:47 +0900
Subject: [PATCH 04/10] Move TTS/tts/utils/japanese

---
 TTS/tts/utils/japanese/__init__.py                          | 1 -
 TTS/tts/utils/text/__init__.py                              | 4 ++--
 TTS/tts/utils/text/japanese/__init__.py                     | 0
 .../utils/{japanese/text.py => text/japanese/phonemizer.py} | 2 +-
 .../text_test.py => text/japanese/phonemizer_test.py}       | 6 +++---
 5 files changed, 6 insertions(+), 7 deletions(-)
 delete mode 100644 TTS/tts/utils/japanese/__init__.py
 create mode 100644 TTS/tts/utils/text/japanese/__init__.py
 rename TTS/tts/utils/{japanese/text.py => text/japanese/phonemizer.py} (99%)
 rename TTS/tts/utils/{japanese/text_test.py => text/japanese/phonemizer_test.py} (77%)

diff --git a/TTS/tts/utils/japanese/__init__.py b/TTS/tts/utils/japanese/__init__.py
deleted file mode 100644
index 30d963e8..00000000
--- a/TTS/tts/utils/japanese/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .text import japanese_text2phone
diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index f6b46783..f9f44167 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -6,7 +6,7 @@ from packaging import version
 
 from TTS.tts.utils.text import cleaners
 from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
-from TTS.tts.utils.japanese import japanese_text2phone
+from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
 from TTS.tts.utils.text.symbols import _bos, _eos, _punctuations, make_symbols, phonemes, symbols
 
 # pylint: disable=unnecessary-comprehension
@@ -42,7 +42,7 @@ def text2phone(text, language):
         return ph
 
     if language == "ja-jp":
-        ph = japanese_text2phone(text)
+        ph = japanese_text_to_phonemes(text)
         return ph
 
     raise ValueError(f" [!] Language {language} is not supported for phonemization.")
diff --git a/TTS/tts/utils/text/japanese/__init__.py b/TTS/tts/utils/text/japanese/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/TTS/tts/utils/japanese/text.py b/TTS/tts/utils/text/japanese/phonemizer.py
similarity index 99%
rename from TTS/tts/utils/japanese/text.py
rename to TTS/tts/utils/text/japanese/phonemizer.py
index 3a705352..f09d5b05 100644
--- a/TTS/tts/utils/japanese/text.py
+++ b/TTS/tts/utils/text/japanese/phonemizer.py
@@ -371,7 +371,7 @@ def text2kata(text: str) -> str:
             res.append(word)
     return hira2kata(''.join(res))
 
-def japanese_text2phone(text: str) -> str:
+def japanese_text_to_phonemes(text: str) -> str:
     """Convert Japanese text to phonemes.
""" res = text2kata(text) diff --git a/TTS/tts/utils/japanese/text_test.py b/TTS/tts/utils/text/japanese/phonemizer_test.py similarity index 77% rename from TTS/tts/utils/japanese/text_test.py rename to TTS/tts/utils/text/japanese/phonemizer_test.py index d3ade826..f07c0901 100644 --- a/TTS/tts/utils/japanese/text_test.py +++ b/TTS/tts/utils/text/japanese/phonemizer_test.py @@ -1,5 +1,5 @@ import unittest -from . import japanese_text2phone +from .phonemizer import japanese_text_to_phonemes _TEST_CASES = ''' どちらに行きますか?/dochiraniikimasuka? @@ -13,10 +13,10 @@ ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu. class TestText(unittest.TestCase): - def test_text2phone(self): + def test_japanese_text_to_phonemes(self): for line in _TEST_CASES.strip().split('\n'): text, phone = line.split('/') - self.assertEqual(japanese_text2phone(text), phone) + self.assertEqual(japanese_text_to_phonemes(text), phone) if __name__ == '__main__': unittest.main() From 29d61741ecdc9c377cf3ff3bda622233304e7127 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 29 May 2021 19:03:23 +0900 Subject: [PATCH 05/10] Copied recipe --- recipes/kokoro/tacotron2-DDC/run.sh | 22 +++++ .../kokoro/tacotron2-DDC/tacotron2-DDC.json | 91 +++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 recipes/kokoro/tacotron2-DDC/run.sh create mode 100644 recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh new file mode 100644 index 00000000..eaa05b60 --- /dev/null +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# take the scripts's parent's directory to prefix all the output paths. +RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +echo $RUN_DIR +# download LJSpeech dataset +wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +# extract +tar -xjf LJSpeech-1.1.tar.bz2 +# create train-val splits +shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv +head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv +tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv +mv LJSpeech-1.1 $RUN_DIR/ +rm LJSpeech-1.1.tar.bz2 +# compute dataset mean and variance for normalization +python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/ +# training .... 
+# change the GPU id if needed
+CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \
+    --coqpit.output_path $RUN_DIR \
+    --coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \
+    --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \
\ No newline at end of file
diff --git a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
new file mode 100644
index 00000000..9cdbbd3b
--- /dev/null
+++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
@@ -0,0 +1,91 @@
+{
+    "datasets": [
+        {
+            "name": "ljspeech",
+            "path": "DEFINE THIS",
+            "meta_file_train": "metadata.csv",
+            "meta_file_val": null
+        }
+    ],
+    "audio": {
+        "fft_size": 1024,
+        "win_length": 1024,
+        "hop_length": 256,
+        "frame_length_ms": null,
+        "frame_shift_ms": null,
+        "sample_rate": 22050,
+        "preemphasis": 0.0,
+        "ref_level_db": 20,
+        "do_trim_silence": true,
+        "trim_db": 60,
+        "power": 1.5,
+        "griffin_lim_iters": 60,
+        "num_mels": 80,
+        "mel_fmin": 50.0,
+        "mel_fmax": 7600.0,
+        "spec_gain": 1,
+        "signal_norm": true,
+        "min_level_db": -100,
+        "symmetric_norm": true,
+        "max_norm": 4.0,
+        "clip_norm": true,
+        "stats_path": "scale_stats.npy"
+    },
+    "gst":{
+        "gst_embedding_dim": 256,
+        "gst_num_heads": 4,
+        "gst_num_style_tokens": 10
+    },
+    "model": "Tacotron2",
+    "run_name": "ljspeech-ddc",
+    "run_description": "tacotron2 with double decoder consistency.",
+    "batch_size": 64,
+    "eval_batch_size": 16,
+    "mixed_precision": true,
+    "loss_masking": true,
+    "decoder_loss_alpha": 0.25,
+    "postnet_loss_alpha": 0.25,
+    "postnet_diff_spec_alpha": 0.25,
+    "decoder_diff_spec_alpha": 0.25,
+    "decoder_ssim_alpha": 0.25,
+    "postnet_ssim_alpha": 0.25,
+    "ga_alpha": 5.0,
+    "stopnet_pos_weight": 15.0,
+    "run_eval": true,
+    "test_delay_epochs": 10,
+    "test_sentences_file": null,
+    "noam_schedule": true,
+    "grad_clip": 0.05,
+    "epochs": 1000,
+    "lr": 0.001,
+    "wd": 1e-06,
+    "warmup_steps": 4000,
+    "memory_size": -1,
+    "prenet_type": "original",
+    "prenet_dropout": true,
+    "attention_type": "original",
+    "location_attn": true,
+    "double_decoder_consistency": true,
+    "ddc_r": 6,
+    "attention_norm": "sigmoid",
+    "r": 6,
+    "gradual_training": [[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
+    "stopnet": true,
+    "separate_stopnet": true,
+    "print_step": 25,
+    "tb_plot_step": 100,
+    "print_eval": false,
+    "save_step": 10000,
+    "checkpoint": true,
+    "text_cleaner": "phoneme_cleaners",
+    "num_loader_workers": 4,
+    "num_val_loader_workers": 4,
+    "batch_group_size": 4,
+    "min_seq_len": 6,
+    "max_seq_len": 180,
+    "compute_input_seq_cache": true,
+    "output_path": "DEFINE THIS",
+    "phoneme_cache_path": "DEFINE THIS",
+    "use_phonemes": false,
+    "phoneme_language": "en-us"
+}
\ No newline at end of file
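run.sh writes scale_stats.npy with TTS/bin/compute_statistics.py, and the recipe's "stats_path" points at it, switching the audio processor to mean-variance normalization. To peek inside the file, something like the following should work, assuming it stores a pickled dict of per-feature statistics (the key layout is not spelled out by this patch, so inspect rather than hard-code):

import numpy as np

# allow_pickle is needed because the file is assumed to hold a dict,
# not a bare array.
stats = np.load("scale_stats.npy", allow_pickle=True).item()
print(sorted(stats.keys()))  # e.g. mel/linear mean and std arrays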
"audio":{ - // stft parameters - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - - // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. - "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - - // Silence trimming - "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - - // Griffin-Lim - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - - // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. - "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 1, - - // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. - "min_level_db": -100, // lower bound for normalization - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "stats_path": "./scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - - // VOCABULARY PARAMETERS - // if custom character set is not defined, - // default set in symbols.py is used - "characters":{ - "pad": "_", - "eos": "~", - "bos": "^", - "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", - "punctuations": "!'(),-.:;? ", - "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" - }, - - // DISTRIBUTED TRAINING - "distributed":{ - "backend": "nccl", - "url": "tcp:\/\/localhost:54321" - }, - - "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. - - // TRAINING - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":16, - "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. - "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. - "mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. 
-
-    // LOSS SETTINGS
-    "loss_masking": true,  // enable / disable loss masking against the sequence padding.
-    "decoder_loss_alpha": 0.5,  // original decoder loss weight. If > 0, it is enabled
-    "postnet_loss_alpha": 0.25,  // original postnet loss weight. If > 0, it is enabled
-    "postnet_diff_spec_alpha": 0.25,  // differential spectral loss weight. If > 0, it is enabled
-    "decoder_diff_spec_alpha": 0.25,  // differential spectral loss weight. If > 0, it is enabled
-    "decoder_ssim_alpha": 0.5,  // decoder ssim loss weight. If > 0, it is enabled
-    "postnet_ssim_alpha": 0.25,  // postnet ssim loss weight. If > 0, it is enabled
-    "ga_alpha": 5.0,  // weight for guided attention loss. If > 0, guided attention is enabled.
-    "stopnet_pos_weight": 15.0,  // pos class weight for stopnet loss since there are way more negative samples than positive samples.
-
-
-    // VALIDATION
-    "run_eval": true,
-    "test_delay_epochs": 10,  // Until attention is aligned, testing only wastes computation time.
-    "test_sentences_file": null,  // set a file to load sentences to be used for testing. If it is null, the default English sentences are used.
-
-    // OPTIMIZER
-    "noam_schedule": false,  // use noam warmup and lr schedule.
-    "grad_clip": 1.0,  // upper limit for gradients for clipping.
-    "epochs": 1000,  // total number of epochs to train.
-    "lr": 0.0001,  // Initial learning rate. If Noam decay is active, maximum learning rate.
-    "wd": 0.000001,  // Weight decay weight.
-    "warmup_steps": 4000,  // Noam decay steps to increase the learning rate from 0 to "lr"
-    "seq_len_norm": false,  // Normalize each sample loss by its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.
-
-    // TACOTRON PRENET
-    "memory_size": -1,  // ONLY TACOTRON - size of the memory queue used for storing the last decoder predictions for auto-regression. If < 0, the memory queue is disabled and the decoder only uses the last prediction frame.
-    "prenet_type": "original",  // "original" or "bn".
-    "prenet_dropout": true,  // enable/disable dropout at prenet.
-
-    // TACOTRON ATTENTION
-    "attention_type": "original",  // 'original', 'graves', 'dynamic_convolution'
-    "attention_heads": 4,  // number of attention heads (only for 'graves')
-    "attention_norm": "sigmoid",  // softmax or sigmoid.
-    "windowing": false,  // Enables attention windowing. Used only in eval mode.
-    "use_forward_attn": false,  // if it uses forward attention. In general, it aligns faster.
-    "forward_attn_mask": false,  // Additional masking forcing monotonicity only in eval mode.
-    "transition_agent": false,  // enable/disable transition agent of forward attention.
-    "location_attn": true,  // enable/disable location sensitive attention. It is enabled for TACOTRON by default.
-    "bidirectional_decoder": false,  // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
-    "double_decoder_consistency": true,  // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
-    "ddc_r": 7,  // reduction rate for coarse decoder.
-
-    // STOPNET
-    "stopnet": true,  // Train stopnet predicting the end of synthesis.
-    "separate_stopnet": true,  // Train stopnet separately if 'stopnet==true'. It prevents stopnet loss from influencing the rest of the model. It results in a better model, but it trains SLOWER.
-
-    // TENSORBOARD and LOGGING
-    "print_step": 25,  // Number of steps to log training on console.
- "tb_plot_step": 100, // Number of steps to plot TB training figures. - "print_eval": false, // If True, it prints intermediate loss values in evalulation. - "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_all_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - - // DATA LOADING - "text_cleaner": "basic_cleaners", - "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. - "batch_group_size": 4, //Number of batches to shuffle after bucketing. - "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training - "max_seq_len": 153, // DATASET-RELATED: maximum text length - "compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage. - "use_noise_augment": true, - - // PATHS - "output_path": "./Models/Kokoro/", - - // PHONEMES - "phoneme_cache_path": "./phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. - "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. - "phoneme_language": "ja-jp", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages - - // MULTI-SPEAKER and GST - "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "use_gst": false, // use global style tokens - "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "gst": { // gst parameter if gst is enabled - "gst_style_input": null, // Condition the style input either on a - // -> wave file [path to wave] or - // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} - // with the dictionary being len(dict) <= len(gst_style_tokens). - "gst_embedding_dim": 512, - "gst_num_heads": 4, - "gst_style_tokens": 10, - "gst_use_speaker_embedding": false - }, - - // DATASETS - "datasets": // List of datasets. They all merged and they get different speaker_ids. 
-        [
-            {
-                "name": "kokoro",
-                "path": "./kokoro-speech-v1_1-small/",
-                "meta_file_train": "metadata.csv",  // for VCTK, if a list, ignore the speaker ids in the list for training; useful for testing cloning with new speakers
-                "meta_file_val": null
-            }
-        ]
-}
diff --git a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
index 9cdbbd3b..1aaec547 100644
--- a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
+++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
@@ -1,8 +1,8 @@
 {
     "datasets": [
         {
-            "name": "ljspeech",
-            "path": "DEFINE THIS",
+            "name": "kokoro",
+            "path": "./kokoro-speech-v1_1-tiny/",
             "meta_file_train": "metadata.csv",
             "meta_file_val": null
         }
@@ -32,44 +32,61 @@
         "stats_path": "scale_stats.npy"
     },
     "gst":{
-        "gst_embedding_dim": 256,
+        "gst_style_input": null,
+
+
+
+        "gst_embedding_dim": 512,
         "gst_num_heads": 4,
-        "gst_num_style_tokens": 10
-    },
+        "gst_style_tokens": 10,
+        "gst_use_speaker_embedding": false
+    },
     "model": "Tacotron2",
-    "run_name": "ljspeech-ddc",
-    "run_description": "tacotron2 with double decoder consistency.",
-    "batch_size": 64,
+    "run_name": "kokoro-ddc",
+    "run_description": "tacotron2 with DDC and differential spectral loss.",
+    "batch_size": 32,
     "eval_batch_size": 16,
     "mixed_precision": true,
+    "distributed": {
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54321"
+    },
+    "reinit_layers": [],
     "loss_masking": true,
-    "decoder_loss_alpha": 0.25,
+    "decoder_loss_alpha": 0.5,
     "postnet_loss_alpha": 0.25,
     "postnet_diff_spec_alpha": 0.25,
     "decoder_diff_spec_alpha": 0.25,
-    "decoder_ssim_alpha": 0.25,
+    "decoder_ssim_alpha": 0.5,
     "postnet_ssim_alpha": 0.25,
     "ga_alpha": 5.0,
     "stopnet_pos_weight": 15.0,
     "run_eval": true,
     "test_delay_epochs": 10,
    "test_sentences_file": null,
-    "noam_schedule": true,
-    "grad_clip": 0.05,
+    "noam_schedule": false,
+    "grad_clip": 1.0,
     "epochs": 1000,
-    "lr": 0.001,
-    "wd": 1e-06,
+    "lr": 0.0001,
+    "wd": 0.000001,
     "warmup_steps": 4000,
+    "seq_len_norm": false,
     "memory_size": -1,
     "prenet_type": "original",
     "prenet_dropout": true,
     "attention_type": "original",
+    "windowing": false,
+    "use_forward_attn": false,
+    "forward_attn_mask": false,
+    "transition_agent": false,
     "location_attn": true,
+    "bidirectional_decoder": false,
     "double_decoder_consistency": true,
-    "ddc_r": 6,
+    "ddc_r": 7,
+    "attention_heads": 4,
     "attention_norm": "sigmoid",
-    "r": 6,
-    "gradual_training": [[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
+    "r": 7,
+    "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]],
     "stopnet": true,
     "separate_stopnet": true,
     "print_step": 25,
@@ -77,15 +94,32 @@
     "print_eval": false,
     "save_step": 10000,
     "checkpoint": true,
-    "text_cleaner": "phoneme_cleaners",
+    "keep_all_best": false,
+    "keep_after": 10000,
+    "tb_model_param_stats": false,
+    "text_cleaner": "basic_cleaners",
+    "enable_eos_bos_chars": false,
     "num_loader_workers": 4,
     "num_val_loader_workers": 4,
     "batch_group_size": 4,
     "min_seq_len": 6,
-    "max_seq_len": 180,
-    "compute_input_seq_cache": true,
+    "max_seq_len": 153,
+    "compute_input_seq_cache": false,
+    "use_noise_augment": true,
-    "output_path": "DEFINE THIS",
-    "phoneme_cache_path": "DEFINE THIS",
-    "use_phonemes": false,
-    "phoneme_language": "en-us"
+    "output_path": "./Models/Kokoro/",
+    "phoneme_cache_path": "./phoneme_cache/",
+    "use_phonemes": true,
+    "phoneme_language": "ja-jp",
+    "characters": {
+        "pad": "_",
+        "eos": "~",
+        "bos": "^",
+        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
", + "punctuations": "!'(),-.:;? ", + "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + }, + "use_speaker_embedding": false, + "use_gst": false, + "use_external_speaker_embedding_file": false, + "external_speaker_embedding_file": "../../speakers-vctk-en.json" } \ No newline at end of file From 88f3255962073d84d1c7d559b956a0330a6fd11d Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 29 May 2021 19:39:51 +0900 Subject: [PATCH 07/10] Update Kokoro recipe --- recipes/kokoro/tacotron2-DDC/run.sh | 27 ++++++++++--------- .../kokoro/tacotron2-DDC/tacotron2-DDC.json | 6 ++--- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh index eaa05b60..cd2aaff5 100644 --- a/recipes/kokoro/tacotron2-DDC/run.sh +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -1,22 +1,23 @@ #!/bin/bash # take the scripts's parent's directory to prefix all the output paths. RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +CORPUS=kokoro-speech-v1_1-tiny echo $RUN_DIR -# download LJSpeech dataset -wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 -# extract -tar -xjf LJSpeech-1.1.tar.bz2 +if [ \! -d $RUN_DIR/$CORPUS ] ; then + echo "$RUN_DIR/$CORPUS doesn't exist." + echo "Follow the instruction of https://github.com/kaiidams/Kokoro-Speech-Dataset to make the corpus." + exit 1 +fi # create train-val splits -shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv -head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv -tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv -mv LJSpeech-1.1 $RUN_DIR/ -rm LJSpeech-1.1.tar.bz2 +shuf $RUN_DIR/$CORPUS/metadata.csv > $RUN_DIR/$CORPUS/metadata_shuf.csv +head -n 8000 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_train.csv +tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.csv # compute dataset mean and variance for normalization -python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/ +python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/ # training .... 
From 88f3255962073d84d1c7d559b956a0330a6fd11d Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Sat, 29 May 2021 19:39:51 +0900
Subject: [PATCH 07/10] Update Kokoro recipe

---
 recipes/kokoro/tacotron2-DDC/run.sh         | 27 +++++++++++----------
 .../kokoro/tacotron2-DDC/tacotron2-DDC.json |  6 ++---
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh
index eaa05b60..cd2aaff5 100644
--- a/recipes/kokoro/tacotron2-DDC/run.sh
+++ b/recipes/kokoro/tacotron2-DDC/run.sh
@@ -1,22 +1,23 @@
 #!/bin/bash
 # take the script's parent directory to prefix all the output paths.
 RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+CORPUS=kokoro-speech-v1_1-tiny
 echo $RUN_DIR
-# download LJSpeech dataset
-wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
-# extract
-tar -xjf LJSpeech-1.1.tar.bz2
+if [ \! -d $RUN_DIR/$CORPUS ] ; then
+    echo "$RUN_DIR/$CORPUS doesn't exist."
+    echo "Follow the instructions at https://github.com/kaiidams/Kokoro-Speech-Dataset to build the corpus."
+    exit 1
+fi
 # create train-val splits
-shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
-head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
-tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
-mv LJSpeech-1.1 $RUN_DIR/
-rm LJSpeech-1.1.tar.bz2
+shuf $RUN_DIR/$CORPUS/metadata.csv > $RUN_DIR/$CORPUS/metadata_shuf.csv
+head -n 8000 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_train.csv
+tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.csv
 # compute dataset mean and variance for normalization
-python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/
+python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/
 # training ....
 # change the GPU id if needed
 CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \
-    --coqpit.output_path $RUN_DIR \
-    --coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \
-    --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \
\ No newline at end of file
+    --coqpit.output_path $RUN_DIR \
+    --coqpit.datasets.0.path $RUN_DIR/$CORPUS \
+    --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \
+    --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \
\ No newline at end of file
diff --git a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
index 1aaec547..b3630055 100644
--- a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
+++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
@@ -2,7 +2,7 @@
     "datasets": [
         {
             "name": "kokoro",
-            "path": "./kokoro-speech-v1_1-tiny/",
+            "path": "DEFINE THIS",
             "meta_file_train": "metadata.csv",
             "meta_file_val": null
         }
@@ -106,8 +106,8 @@
     "max_seq_len": 153,
     "compute_input_seq_cache": false,
     "use_noise_augment": true,
-    "output_path": "./Models/Kokoro/",
-    "phoneme_cache_path": "./phoneme_cache/",
+    "output_path": "DEFINE THIS",
+    "phoneme_cache_path": "DEFINE THIS",
     "use_phonemes": true,
     "phoneme_language": "ja-jp",

From 2091e808c82647787b571f1b17e80378d203e830 Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Sat, 29 May 2021 19:41:00 +0900
Subject: [PATCH 08/10] Fix path

---
 recipes/kokoro/tacotron2-DDC/run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh
index cd2aaff5..86fda642 100644
--- a/recipes/kokoro/tacotron2-DDC/run.sh
+++ b/recipes/kokoro/tacotron2-DDC/run.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # take the script's parent directory to prefix all the output paths.
 RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
-CORPUS=kokoro-speech-v1_1-tiny
+CORPUS=kokoro-speech-v1_1-small
 echo $RUN_DIR
 if [ \! -d $RUN_DIR/$CORPUS ] ; then
     echo "$RUN_DIR/$CORPUS doesn't exist."

From 1cc18d19729545c83e2a7482b949f896fd714ef4 Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Tue, 1 Jun 2021 18:51:34 +0900
Subject: [PATCH 09/10] Move unittest of Japanese phonemizer.

---
 .../tts_tests/test_japanese_phonemizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename TTS/tts/utils/text/japanese/phonemizer_test.py => tests/tts_tests/test_japanese_phonemizer.py (89%)

diff --git a/TTS/tts/utils/text/japanese/phonemizer_test.py b/tests/tts_tests/test_japanese_phonemizer.py
similarity index 89%
rename from TTS/tts/utils/text/japanese/phonemizer_test.py
rename to tests/tts_tests/test_japanese_phonemizer.py
index f07c0901..437042f0 100644
--- a/TTS/tts/utils/text/japanese/phonemizer_test.py
+++ b/tests/tts_tests/test_japanese_phonemizer.py
@@ -1,5 +1,5 @@
 import unittest
-from .phonemizer import japanese_text_to_phonemes
+from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
 
 _TEST_CASES = '''
 どちらに行きますか?/dochiraniikimasuka?

From 6d8310d2a99de22e3537321acbf48f9b35b00b14 Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Wed, 2 Jun 2021 07:48:28 +0900
Subject: [PATCH 10/10] Set the version to the same as the dev branch.

---
 TTS/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/_version.py b/TTS/_version.py
index f4956698..311f216e 100644
--- a/TTS/_version.py
+++ b/TTS/_version.py
@@ -1 +1 @@
-__version__ = "0.0.14.1"
+__version__ = "0.0.14"
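With the whole series applied, Japanese goes through the generic front end: text2phone() dispatches "ja-jp" to japanese_text_to_phonemes(). A minimal smoke test, with the expected string taken from tests/tts_tests/test_japanese_phonemizer.py above:

from TTS.tts.utils.text import text2phone

# "ja-jp" routes to the Japanese phonemizer added by this series.
assert text2phone('今日は温泉に、行きます。', 'ja-jp') == 'kyo:waoNseNni,ikimasu.'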