From 0536aa6d0f41b125dd96811a8a5b04fac70d6652 Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Sat, 22 May 2021 17:12:19 +0900
Subject: [PATCH 01/10] Japanese Tacotron 2 model

---
 TTS/tts/configs/kokoro_tacotron2.json | 173 ++++++++++++
 TTS/tts/datasets/preprocess.py        |  14 +
 TTS/tts/utils/japanese/__init__.py    |   1 +
 TTS/tts/utils/japanese/text.py        | 380 ++++++++++++++++++++++++++
 TTS/tts/utils/japanese/text_test.py   |  22 ++
 TTS/tts/utils/text/__init__.py        |   5 +
 requirements.txt                      |   2 +
 7 files changed, 597 insertions(+)
 create mode 100644 TTS/tts/configs/kokoro_tacotron2.json
 create mode 100644 TTS/tts/utils/japanese/__init__.py
 create mode 100644 TTS/tts/utils/japanese/text.py
 create mode 100644 TTS/tts/utils/japanese/text_test.py

diff --git a/TTS/tts/configs/kokoro_tacotron2.json b/TTS/tts/configs/kokoro_tacotron2.json
new file mode 100644
index 00000000..f5d41194
--- /dev/null
+++ b/TTS/tts/configs/kokoro_tacotron2.json
@@ -0,0 +1,173 @@
+{
+    "model": "Tacotron2",
+    "run_name": "kokoro-ddc",
+    "run_description": "tacotron2 with DDC and differential spectral loss.",
+
+    // AUDIO PARAMETERS
+    "audio":{
+        // stft parameters
+        "fft_size": 1024,  // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024,  // stft window length in samples.
+        "hop_length": 256,  // stft window hop-length in samples.
+        "frame_length_ms": null,  // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.
+
+        // Audio processing parameters
+        "sample_rate": 22050,  // DATASET-RELATED: wav sample-rate.
+        "preemphasis": 0.0,  // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+        "ref_level_db": 20,  // reference level db, theoretically 20db is the sound of air.
+
+        // Silence trimming
+        "do_trim_silence": true,  // enable trimming of silence of audio as you load it. LJSpeech (true), TWEB (false), Nancy (true)
+        "trim_db": 60,  // threshold for trimming silence. Set this according to your dataset.
+
+        // Griffin-Lim
+        "power": 1.5,  // value to sharpen wav signals after GL algorithm.
+        "griffin_lim_iters": 60,  // #griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
+
+        // MelSpectrogram parameters
+        "num_mels": 80,  // size of the mel spec frame.
+        "mel_fmin": 50.0,  // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 7600.0,  // maximum freq level for mel-spec. Tune for dataset!!
+        "spec_gain": 1,
+
+        // Normalization parameters
+        "signal_norm": true,  // normalize spec values. Mean-Var normalization if 'stats_path' is defined, otherwise range normalization defined by the other params.
+        "min_level_db": -100,  // lower bound for normalization
+        "symmetric_norm": true,  // move normalization to range [-1, 1]
+        "max_norm": 4.0,  // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true,  // clip normalized values into the range.
+        "stats_path": "./scale_stats.npy"  // DO NOT USE WITH MULTI_SPEAKER MODEL. Scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored.
+    },
+
+    // VOCABULARY PARAMETERS
+    // if custom character set is not defined,
+    // default set in symbols.py is used
+    "characters":{
+        "pad": "_",
+        "eos": "~",
+        "bos": "^",
+        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+        "punctuations": "!'(),-.:;? ",
", + "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + }, + + // DISTRIBUTED TRAINING + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":16, + "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. + + // LOSS SETTINGS + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled + "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled + "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled + "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled + "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. + + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "noam_schedule": false, // use noam warmup and lr schedule. + "grad_clip": 1.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "wd": 0.000001, // Weight decay weight. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. + + // TACOTRON PRENET + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. + "prenet_type": "original", // "original" or "bn". + "prenet_dropout": true, // enable/disable dropout at prenet. + + // TACOTRON ATTENTION + "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution' + "attention_heads": 4, // number of attention heads (only for 'graves') + "attention_norm": "sigmoid", // softmax or sigmoid. + "windowing": false, // Enables attention windowing. Used only in eval mode. 
+ "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. + "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. + "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log training on console. + "tb_plot_step": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evalulation. + "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "basic_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "batch_group_size": 4, //Number of batches to shuffle after bucketing. + "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + "compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage. + "use_noise_augment": true, + + // PATHS + "output_path": "./Models/Kokoro/", + + // PHONEMES + "phoneme_cache_path": "./phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_language": "ja-jp", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. 
+ "use_gst": false, // use global style tokens + "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) <= len(gst_style_tokens). + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10, + "gst_use_speaker_embedding": false + }, + + // DATASETS + "datasets": // List of datasets. They all merged and they get different speaker_ids. + [ + { + "name": "kokoro", + "path": "./kokoro-speech-v1_1-small/", + "meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers + "meta_file_val": null + } + ] +} diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 72ab160e..271b1734 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -424,3 +424,17 @@ def baker(root_path: str, meta_file: str) -> List[List[str]]: wav_path = os.path.join(root_path, "clips_22", wav_name) items.append([text, wav_path, speaker_name]) return items + + +def kokoro(root_path, meta_file): + """Japanese single-speaker dataset from https://github.com/kaiidams/Kokoro-Speech-Dataset""" + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "kokoro" + with open(txt_file, "r") as ttf: + for line in ttf: + cols = line.split("|") + wav_file = os.path.join(root_path, "wavs", cols[0] + '.wav') + text = cols[2].replace(" ", "") + items.append([text, wav_file, speaker_name]) + return items diff --git a/TTS/tts/utils/japanese/__init__.py b/TTS/tts/utils/japanese/__init__.py new file mode 100644 index 00000000..0ce7a99d --- /dev/null +++ b/TTS/tts/utils/japanese/__init__.py @@ -0,0 +1 @@ +from .text import japanese_text2phone \ No newline at end of file diff --git a/TTS/tts/utils/japanese/text.py b/TTS/tts/utils/japanese/text.py new file mode 100644 index 00000000..4c8936ac --- /dev/null +++ b/TTS/tts/utils/japanese/text.py @@ -0,0 +1,380 @@ +# Convert Japanese text to phonemes which is +# compatible with Julius https://github.com/julius-speech/segmentation-kit + +import re +import MeCab +from typing import List, Tuple + +_CONVRULES = [ + # Conversion of 2 letters + 'アァ/ a a', + 'イィ/ i i', + 'イェ/ i e', + 'イャ/ y a', + 'ウゥ/ u:', + 'エェ/ e e', + 'オォ/ o:', + 'カァ/ k a:', + 'キィ/ k i:', + 'クゥ/ k u:', + 'クャ/ ky a', + 'クュ/ ky u', + 'クョ/ ky o', + 'ケェ/ k e:', + 'コォ/ k o:', + 'ガァ/ g a:', + 'ギィ/ g i:', + 'グゥ/ g u:', + 'グャ/ gy a', + 'グュ/ gy u', + 'グョ/ gy o', + 'ゲェ/ g e:', + 'ゴォ/ g o:', + 'サァ/ s a:', + 'シィ/ sh i:', + 'スゥ/ s u:', + 'スャ/ sh a', + 'スュ/ sh u', + 'スョ/ sh o', + 'セェ/ s e:', + 'ソォ/ s o:', + 'ザァ/ z a:', + 'ジィ/ j i:', + 'ズゥ/ z u:', + 'ズャ/ zy a', + 'ズュ/ zy u', + 'ズョ/ zy o', + 'ゼェ/ z e:', + 'ゾォ/ z o:', + 'タァ/ t a:', + 'チィ/ ch i:', + 'ツァ/ ts a', + 'ツィ/ ts i', + 'ツゥ/ 
diff --git a/TTS/tts/utils/japanese/__init__.py b/TTS/tts/utils/japanese/__init__.py
new file mode 100644
index 00000000..0ce7a99d
--- /dev/null
+++ b/TTS/tts/utils/japanese/__init__.py
@@ -0,0 +1 @@
+from .text import japanese_text2phone
\ No newline at end of file
diff --git a/TTS/tts/utils/japanese/text.py b/TTS/tts/utils/japanese/text.py
new file mode 100644
index 00000000..4c8936ac
--- /dev/null
+++ b/TTS/tts/utils/japanese/text.py
@@ -0,0 +1,380 @@
+# Convert Japanese text to phonemes that are compatible with
+# Julius https://github.com/julius-speech/segmentation-kit
+
+import re
+import MeCab
+from typing import List, Tuple
+
+_CONVRULES = [
+    # Conversion of 2 letters
+    'アァ/ a a',
+    'イィ/ i i',
+    'イェ/ i e',
+    'イャ/ y a',
+    'ウゥ/ u:',
+    'エェ/ e e',
+    'オォ/ o:',
+    'カァ/ k a:',
+    'キィ/ k i:',
+    'クゥ/ k u:',
+    'クャ/ ky a',
+    'クュ/ ky u',
+    'クョ/ ky o',
+    'ケェ/ k e:',
+    'コォ/ k o:',
+    'ガァ/ g a:',
+    'ギィ/ g i:',
+    'グゥ/ g u:',
+    'グャ/ gy a',
+    'グュ/ gy u',
+    'グョ/ gy o',
+    'ゲェ/ g e:',
+    'ゴォ/ g o:',
+    'サァ/ s a:',
+    'シィ/ sh i:',
+    'スゥ/ s u:',
+    'スャ/ sh a',
+    'スュ/ sh u',
+    'スョ/ sh o',
+    'セェ/ s e:',
+    'ソォ/ s o:',
+    'ザァ/ z a:',
+    'ジィ/ j i:',
+    'ズゥ/ z u:',
+    'ズャ/ zy a',
+    'ズュ/ zy u',
+    'ズョ/ zy o',
+    'ゼェ/ z e:',
+    'ゾォ/ z o:',
+    'タァ/ t a:',
+    'チィ/ ch i:',
+    'ツァ/ ts a',
+    'ツィ/ ts i',
+    'ツゥ/ ts u:',
+    'ツャ/ ch a',
+    'ツュ/ ch u',
+    'ツョ/ ch o',
+    'ツェ/ ts e',
+    'ツォ/ ts o',
+    'テェ/ t e:',
+    'トォ/ t o:',
+    'ダァ/ d a:',
+    'ヂィ/ j i:',
+    'ヅゥ/ d u:',
+    'ヅャ/ zy a',
+    'ヅュ/ zy u',
+    'ヅョ/ zy o',
+    'デェ/ d e:',
+    'ドォ/ d o:',
+    'ナァ/ n a:',
+    'ニィ/ n i:',
+    'ヌゥ/ n u:',
+    'ヌャ/ ny a',
+    'ヌュ/ ny u',
+    'ヌョ/ ny o',
+    'ネェ/ n e:',
+    'ノォ/ n o:',
+    'ハァ/ h a:',
+    'ヒィ/ h i:',
+    'フゥ/ f u:',
+    'フャ/ hy a',
+    'フュ/ hy u',
+    'フョ/ hy o',
+    'ヘェ/ h e:',
+    'ホォ/ h o:',
+    'バァ/ b a:',
+    'ビィ/ b i:',
+    'ブゥ/ b u:',
+    'ブャ/ by a',
+    'ブュ/ by u',
+    'ブョ/ by o',
+    'ベェ/ b e:',
+    'ボォ/ b o:',
+    'パァ/ p a:',
+    'ピィ/ p i:',
+    'プゥ/ p u:',
+    'プャ/ py a',
+    'プュ/ py u',
+    'プョ/ py o',
+    'ペェ/ p e:',
+    'ポォ/ p o:',
+    'マァ/ m a:',
+    'ミィ/ m i:',
+    'ムゥ/ m u:',
+    'ムャ/ my a',
+    'ムュ/ my u',
+    'ムョ/ my o',
+    'メェ/ m e:',
+    'モォ/ m o:',
+    'ヤァ/ y a:',
+    'ユゥ/ y u:',
+    'ユャ/ y a:',
+    'ユュ/ y u:',
+    'ユョ/ y o:',
+    'ヨォ/ y o:',
+    'ラァ/ r a:',
+    'リィ/ r i:',
+    'ルゥ/ r u:',
+    'ルャ/ ry a',
+    'ルュ/ ry u',
+    'ルョ/ ry o',
+    'レェ/ r e:',
+    'ロォ/ r o:',
+    'ワァ/ w a:',
+    'ヲォ/ o:',
+    'ディ/ d i',
+    'デェ/ d e:',
+    'デャ/ dy a',
+    'デュ/ dy u',
+    'デョ/ dy o',
+    'ティ/ t i',
+    'テェ/ t e:',
+    'テャ/ ty a',
+    'テュ/ ty u',
+    'テョ/ ty o',
+    'スィ/ s i',
+    'ズァ/ z u a',
+    'ズィ/ z i',
+    'ズゥ/ z u',
+    'ズャ/ zy a',
+    'ズュ/ zy u',
+    'ズョ/ zy o',
+    'ズェ/ z e',
+    'ズォ/ z o',
+    'キャ/ ky a',
+    'キュ/ ky u',
+    'キョ/ ky o',
+    'シャ/ sh a',
+    'シュ/ sh u',
+    'シェ/ sh e',
+    'ショ/ sh o',
+    'チャ/ ch a',
+    'チュ/ ch u',
+    'チェ/ ch e',
+    'チョ/ ch o',
+    'トゥ/ t u',
+    'トャ/ ty a',
+    'トュ/ ty u',
+    'トョ/ ty o',
+    'ドァ/ d o a',
+    'ドゥ/ d u',
+    'ドャ/ dy a',
+    'ドュ/ dy u',
+    'ドョ/ dy o',
+    'ドォ/ d o:',
+    'ニャ/ ny a',
+    'ニュ/ ny u',
+    'ニョ/ ny o',
+    'ヒャ/ hy a',
+    'ヒュ/ hy u',
+    'ヒョ/ hy o',
+    'ミャ/ my a',
+    'ミュ/ my u',
+    'ミョ/ my o',
+    'リャ/ ry a',
+    'リュ/ ry u',
+    'リョ/ ry o',
+    'ギャ/ gy a',
+    'ギュ/ gy u',
+    'ギョ/ gy o',
+    'ヂェ/ j e',
+    'ヂャ/ j a',
+    'ヂュ/ j u',
+    'ヂョ/ j o',
+    'ジェ/ j e',
+    'ジャ/ j a',
+    'ジュ/ j u',
+    'ジョ/ j o',
+    'ビャ/ by a',
+    'ビュ/ by u',
+    'ビョ/ by o',
+    'ピャ/ py a',
+    'ピュ/ py u',
+    'ピョ/ py o',
+    'ウァ/ u a',
+    'ウィ/ w i',
+    'ウェ/ w e',
+    'ウォ/ w o',
+    'ファ/ f a',
+    'フィ/ f i',
+    'フゥ/ f u',
+    'フャ/ hy a',
+    'フュ/ hy u',
+    'フョ/ hy o',
+    'フェ/ f e',
+    'フォ/ f o',
+    'ヴァ/ b a',
+    'ヴィ/ b i',
+    'ヴェ/ b e',
+    'ヴォ/ b o',
+    'ヴュ/ by u',
+
+    # Conversion of 1 letter
+    'ア/ a',
+    'イ/ i',
+    'ウ/ u',
+    'エ/ e',
+    'オ/ o',
+    'カ/ k a',
+    'キ/ k i',
+    'ク/ k u',
+    'ケ/ k e',
+    'コ/ k o',
+    'サ/ s a',
+    'シ/ sh i',
+    'ス/ s u',
+    'セ/ s e',
+    'ソ/ s o',
+    'タ/ t a',
+    'チ/ ch i',
+    'ツ/ ts u',
+    'テ/ t e',
+    'ト/ t o',
+    'ナ/ n a',
+    'ニ/ n i',
+    'ヌ/ n u',
+    'ネ/ n e',
+    'ノ/ n o',
+    'ハ/ h a',
+    'ヒ/ h i',
+    'フ/ f u',
+    'ヘ/ h e',
+    'ホ/ h o',
+    'マ/ m a',
+    'ミ/ m i',
+    'ム/ m u',
+    'メ/ m e',
+    'モ/ m o',
+    'ラ/ r a',
+    'リ/ r i',
+    'ル/ r u',
+    'レ/ r e',
+    'ロ/ r o',
+    'ガ/ g a',
+    'ギ/ g i',
+    'グ/ g u',
+    'ゲ/ g e',
+    'ゴ/ g o',
+    'ザ/ z a',
+    'ジ/ j i',
+    'ズ/ z u',
+    'ゼ/ z e',
+    'ゾ/ z o',
+    'ダ/ d a',
+    'ヂ/ j i',
+    'ヅ/ z u',
+    'デ/ d e',
+    'ド/ d o',
+    'バ/ b a',
+    'ビ/ b i',
+    'ブ/ b u',
+    'ベ/ b e',
+    'ボ/ b o',
+    'パ/ p a',
+    'ピ/ p i',
+    'プ/ p u',
+    'ペ/ p e',
+    'ポ/ p o',
+    'ヤ/ y a',
+    'ユ/ y u',
+    'ヨ/ y o',
+    'ワ/ w a',
+    'ヰ/ i',
+    'ヱ/ e',
+    'ヲ/ o',
+    'ン/ N',
+    'ッ/ q',
+    'ヴ/ b u',
+    'ー/:',
+
+    # Try converting broken text
+    'ァ/ a',
+    'ィ/ i',
+    'ゥ/ u',
+    'ェ/ e',
+    'ォ/ o',
+    'ヮ/ w a',
+    'ォ/ o',
+
+    # Symbols
+    '、/ ,',
+    '。/ .',
+    '!/ !',
+    '?/ ?',
+    '・/ ,'
+]
+
+_COLON_RX = re.compile(':+')
+_REJECT_RX = re.compile('[^ a-zA-Z:,.?]')
+
+def _makerulemap():
+    l = [tuple(x.split('/')) for x in _CONVRULES]
+    return tuple(
+        {k: v for k, v in l if len(k) == i}
+        for i in (1, 2)
+    )
+
+_RULEMAP1, _RULEMAP2 = _makerulemap()
+
+def kata2phoneme(text: str) -> str:
+    """Convert katakana text to phonemes.
+    """
+    text = text.strip()
+    res = ''
+    while text:
+        if len(text) >= 2:
+            x = _RULEMAP2.get(text[:2])
+            if x is not None:
+                text = text[2:]
+                res += x
+                continue
+        x = _RULEMAP1.get(text[0])
+        if x is not None:
+            text = text[1:]
+            res += x
+            continue
+        res += ' ' + text[0]
+        text = text[1:]
+    res = _COLON_RX.sub(':', res)
+    return res[1:]
+
+_KATAKANA = ''.join(chr(ch) for ch in range(ord('ァ'), ord('ン') + 1))
+_HIRAGANA = ''.join(chr(ch) for ch in range(ord('ぁ'), ord('ん') + 1))
+_HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)
+
+def hira2kata(text: str) -> str:
+    text = text.translate(_HIRA2KATATRANS)
+    return text.replace('う゛', 'ヴ')
+
+_SYMBOL_TOKENS = set(list('・、。?!'))
+_NO_YOMI_TOKENS = set(list('「」『』―()[][] …'))
+_TAGGER = MeCab.Tagger()
+
+def text2kata(text: str) -> str:
+    parsed = _TAGGER.parse(text)
+    res = []
+    for line in parsed.split('\n'):
+        if line == 'EOS':
+            break
+        parts = line.split('\t')
+
+        word, yomi = parts[0], parts[1]
+        if yomi:
+            res.append(yomi)
+        else:
+            if word in _SYMBOL_TOKENS:
+                res.append(word)
+            elif word == 'っ' or word == 'ッ':
+                res.append('ッ')
+            elif word in _NO_YOMI_TOKENS:
+                pass
+            else:
+                res.append(word)
+    return hira2kata(''.join(res))
+
+def japanese_text2phone(text: str) -> str:
+    """Convert Japanese text to phonemes.
+    """
+    res = text2kata(text)
+    res = kata2phoneme(res)
+    return res.replace(' ', '')
\ No newline at end of file
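The module works in two stages: text2kata() normalizes the input to katakana using MeCab readings, and kata2phoneme() consumes that katakana with a longest-match-first scan, trying the two-character map _RULEMAP2 before the one-character map _RULEMAP1. A small sketch of both behaviors, importing from the module as it exists at this point in the series (it is moved and renamed in a later patch); the end-to-end expectation is taken from the test file added below:

from TTS.tts.utils.japanese.text import kata2phoneme, japanese_text2phone

# Two-character rules win: 'キャ' hits the 'キャ/ ky a' rule directly instead
# of decomposing into 'キ' (k i) plus the small 'ャ'.
print(kata2phoneme('キャ'))  # 'ky a'
print(kata2phoneme('キア'))  # 'k i a'

# End to end: kanji/kana -> katakana via MeCab -> phonemes, spaces removed.
assert japanese_text2phone('どちらに行きますか?') == 'dochiraniikimasuka?'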
diff --git a/TTS/tts/utils/japanese/text_test.py b/TTS/tts/utils/japanese/text_test.py
new file mode 100644
index 00000000..7a04925a
--- /dev/null
+++ b/TTS/tts/utils/japanese/text_test.py
@@ -0,0 +1,22 @@
+import unittest
+from . import japanese_text2phone
+
+_TEST_CASES = '''
+どちらに行きますか?/dochiraniikimasuka?
+今日は温泉に、行きます。/kyo:waoNseNni,ikimasu.
+「A」から「Z」までです。/AkaraZmadedesu.
+そうですね!/so:desune!
+クジラは哺乳類です。/kujirawahonyu:ruidesu.
+ヴィディオを見ます。/bidioomimasu.
+ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu.
+'''
+
+class TestText(unittest.TestCase):
+
+    def test_text2phone(self):
+        for line in _TEST_CASES.strip().split('\n'):
+            text, phone = line.split('/')
+            self.assertEqual(japanese_text2phone(text), phone)
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index 9367e6e2..9b63e7f1 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -39,6 +39,11 @@ def text2phone(text, language):
     if language == "zh-CN":
         ph = chinese_text_to_phonemes(text)
         return ph
+    elif language == "ja-jp":
+        from TTS.tts.utils.japanese import japanese_text2phone
+        ph = japanese_text2phone(text)
+        return ph
+
     raise ValueError(f" [!] Language {language} is not supported for phonemization.")
diff --git a/requirements.txt b/requirements.txt
index c6ce7672..7f45f9e0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,5 @@ numba==0.52
 umap-learn==0.4.6
 unidecode==0.4.20
 coqpit
+mecab-python3
+unidic-lite
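The two new requirements close the loop for text2kata(): mecab-python3 provides MeCab.Tagger, and unidic-lite ships a dictionary so the tagger works without a system-wide MeCab install. The parser above assumes each output line is tab-separated with the reading in the second field, terminated by an 'EOS' line. A quick way to eyeball that assumption; the exact feature columns depend on the installed dictionary, so treat the printed shape as something to verify locally:

import MeCab  # from mecab-python3; unidic-lite supplies the dictionary

tagger = MeCab.Tagger()
for line in tagger.parse('行きます').split('\n'):
    if line == 'EOS':
        break
    parts = line.split('\t')
    # text2kata() uses parts[1] as the katakana reading (yomi)
    print(parts[0], parts[1] if len(parts) > 1 else '(no yomi)')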
From f921a05bdb6ce6fc950c290b8a3aec613a7f70fe Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Wed, 26 May 2021 19:02:16 +0900
Subject: [PATCH 02/10] Fixed lint errors

---
 TTS/tts/utils/japanese/__init__.py  | 2 +-
 TTS/tts/utils/japanese/text.py      | 5 ++---
 TTS/tts/utils/japanese/text_test.py | 2 +-
 TTS/tts/utils/text/__init__.py      | 3 ++-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/TTS/tts/utils/japanese/__init__.py b/TTS/tts/utils/japanese/__init__.py
index 0ce7a99d..30d963e8 100644
--- a/TTS/tts/utils/japanese/__init__.py
+++ b/TTS/tts/utils/japanese/__init__.py
@@ -1 +1 @@
-from .text import japanese_text2phone
\ No newline at end of file
+from .text import japanese_text2phone
diff --git a/TTS/tts/utils/japanese/text.py b/TTS/tts/utils/japanese/text.py
index 4c8936ac..3a705352 100644
--- a/TTS/tts/utils/japanese/text.py
+++ b/TTS/tts/utils/japanese/text.py
@@ -3,7 +3,6 @@
 
 import re
 import MeCab
-from typing import List, Tuple
 
 _CONVRULES = [
     # Conversion of 2 letters
@@ -364,7 +363,7 @@ def text2kata(text: str) -> str:
         else:
             if word in _SYMBOL_TOKENS:
                 res.append(word)
-            elif word == 'っ' or word == 'ッ':
+            elif word in ('っ', 'ッ'):
                 res.append('ッ')
             elif word in _NO_YOMI_TOKENS:
                 pass
@@ -377,4 +376,4 @@ def japanese_text2phone(text: str) -> str:
     """
     res = text2kata(text)
     res = kata2phoneme(res)
-    return res.replace(' ', '')
\ No newline at end of file
+    return res.replace(' ', '')
diff --git a/TTS/tts/utils/japanese/text_test.py b/TTS/tts/utils/japanese/text_test.py
index 7a04925a..d3ade826 100644
--- a/TTS/tts/utils/japanese/text_test.py
+++ b/TTS/tts/utils/japanese/text_test.py
@@ -19,4 +19,4 @@ class TestText(unittest.TestCase):
         self.assertEqual(japanese_text2phone(text), phone)
 
 if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index 9b63e7f1..d7423102 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -39,7 +39,8 @@ def text2phone(text, language):
     if language == "zh-CN":
         ph = chinese_text_to_phonemes(text)
         return ph
-    elif language == "ja-jp":
+
+    if language == "ja-jp":
         from TTS.tts.utils.japanese import japanese_text2phone
         ph = japanese_text2phone(text)
         return ph

From c4987e9d4e503628df5661c0945b2047ea046b1f Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Fri, 28 May 2021 00:22:57 +0900
Subject: [PATCH 03/10] Move import to the head of the file.

---
 TTS/tts/utils/text/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index d7423102..f6b46783 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -6,6 +6,7 @@ from packaging import version
 
 from TTS.tts.utils.text import cleaners
 from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
+from TTS.tts.utils.japanese import japanese_text2phone
 from TTS.tts.utils.text.symbols import _bos, _eos, _punctuations, make_symbols, phonemes, symbols
 
 # pylint: disable=unnecessary-comprehension
@@ -41,7 +42,6 @@ def text2phone(text, language):
         return ph
 
     if language == "ja-jp":
-        from TTS.tts.utils.japanese import japanese_text2phone
         ph = japanese_text2phone(text)
         return ph

From d0c9c1ca5c28d37845ea7a19d399851c5bfd5429 Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Sat, 29 May 2021 09:21:47 +0900
Subject: [PATCH 04/10] Move TTS/tts/utils/japanese

---
 TTS/tts/utils/japanese/__init__.py                          | 1 -
 TTS/tts/utils/text/__init__.py                              | 4 ++--
 TTS/tts/utils/text/japanese/__init__.py                     | 0
 .../utils/{japanese/text.py => text/japanese/phonemizer.py} | 2 +-
 .../text_test.py => text/japanese/phonemizer_test.py}       | 6 +++---
 5 files changed, 6 insertions(+), 7 deletions(-)
 delete mode 100644 TTS/tts/utils/japanese/__init__.py
 create mode 100644 TTS/tts/utils/text/japanese/__init__.py
 rename TTS/tts/utils/{japanese/text.py => text/japanese/phonemizer.py} (99%)
 rename TTS/tts/utils/{japanese/text_test.py => text/japanese/phonemizer_test.py} (77%)

diff --git a/TTS/tts/utils/japanese/__init__.py b/TTS/tts/utils/japanese/__init__.py
deleted file mode 100644
index 30d963e8..00000000
--- a/TTS/tts/utils/japanese/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .text import japanese_text2phone
diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index f6b46783..f9f44167 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -6,7 +6,7 @@ from packaging import version
 
 from TTS.tts.utils.text import cleaners
 from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
-from TTS.tts.utils.japanese import japanese_text2phone
+from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
 from TTS.tts.utils.text.symbols import _bos, _eos, _punctuations, make_symbols, phonemes, symbols
 
 # pylint: disable=unnecessary-comprehension
@@ -42,7 +42,7 @@ def text2phone(text, language):
         return ph
 
     if language == "ja-jp":
-        ph = japanese_text2phone(text)
+        ph = japanese_text_to_phonemes(text)
         return ph
 
     raise ValueError(f" [!] Language {language} is not supported for phonemization.")
diff --git a/TTS/tts/utils/text/japanese/__init__.py b/TTS/tts/utils/text/japanese/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/TTS/tts/utils/japanese/text.py b/TTS/tts/utils/text/japanese/phonemizer.py
similarity index 99%
rename from TTS/tts/utils/japanese/text.py
rename to TTS/tts/utils/text/japanese/phonemizer.py
index 3a705352..f09d5b05 100644
--- a/TTS/tts/utils/japanese/text.py
+++ b/TTS/tts/utils/text/japanese/phonemizer.py
@@ -371,7 +371,7 @@ def text2kata(text: str) -> str:
             res.append(word)
     return hira2kata(''.join(res))
 
-def japanese_text2phone(text: str) -> str:
+def japanese_text_to_phonemes(text: str) -> str:
     """Convert Japanese text to phonemes.
""" res = text2kata(text) diff --git a/TTS/tts/utils/japanese/text_test.py b/TTS/tts/utils/text/japanese/phonemizer_test.py similarity index 77% rename from TTS/tts/utils/japanese/text_test.py rename to TTS/tts/utils/text/japanese/phonemizer_test.py index d3ade826..f07c0901 100644 --- a/TTS/tts/utils/japanese/text_test.py +++ b/TTS/tts/utils/text/japanese/phonemizer_test.py @@ -1,5 +1,5 @@ import unittest -from . import japanese_text2phone +from .phonemizer import japanese_text_to_phonemes _TEST_CASES = ''' どちらに行きますか?/dochiraniikimasuka? @@ -13,10 +13,10 @@ ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu. class TestText(unittest.TestCase): - def test_text2phone(self): + def test_japanese_text_to_phonemes(self): for line in _TEST_CASES.strip().split('\n'): text, phone = line.split('/') - self.assertEqual(japanese_text2phone(text), phone) + self.assertEqual(japanese_text_to_phonemes(text), phone) if __name__ == '__main__': unittest.main() From 29d61741ecdc9c377cf3ff3bda622233304e7127 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 29 May 2021 19:03:23 +0900 Subject: [PATCH 05/10] Copied recipe --- recipes/kokoro/tacotron2-DDC/run.sh | 22 +++++ .../kokoro/tacotron2-DDC/tacotron2-DDC.json | 91 +++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 recipes/kokoro/tacotron2-DDC/run.sh create mode 100644 recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh new file mode 100644 index 00000000..eaa05b60 --- /dev/null +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# take the scripts's parent's directory to prefix all the output paths. +RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +echo $RUN_DIR +# download LJSpeech dataset +wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +# extract +tar -xjf LJSpeech-1.1.tar.bz2 +# create train-val splits +shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv +head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv +tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv +mv LJSpeech-1.1 $RUN_DIR/ +rm LJSpeech-1.1.tar.bz2 +# compute dataset mean and variance for normalization +python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/ +# training .... 
+# change the GPU id if needed
+CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \
+    --coqpit.output_path $RUN_DIR \
+    --coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \
+    --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \
\ No newline at end of file
diff --git a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
new file mode 100644
index 00000000..9cdbbd3b
--- /dev/null
+++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
@@ -0,0 +1,91 @@
+{
+    "datasets": [
+        {
+            "name": "ljspeech",
+            "path": "DEFINE THIS",
+            "meta_file_train": "metadata.csv",
+            "meta_file_val": null
+        }
+    ],
+    "audio": {
+        "fft_size": 1024,
+        "win_length": 1024,
+        "hop_length": 256,
+        "frame_length_ms": null,
+        "frame_shift_ms": null,
+        "sample_rate": 22050,
+        "preemphasis": 0.0,
+        "ref_level_db": 20,
+        "do_trim_silence": true,
+        "trim_db": 60,
+        "power": 1.5,
+        "griffin_lim_iters": 60,
+        "num_mels": 80,
+        "mel_fmin": 50.0,
+        "mel_fmax": 7600.0,
+        "spec_gain": 1,
+        "signal_norm": true,
+        "min_level_db": -100,
+        "symmetric_norm": true,
+        "max_norm": 4.0,
+        "clip_norm": true,
+        "stats_path": "scale_stats.npy"
+    },
+    "gst":{
+        "gst_embedding_dim": 256,
+        "gst_num_heads": 4,
+        "gst_num_style_tokens": 10
+    },
+    "model": "Tacotron2",
+    "run_name": "ljspeech-ddc",
+    "run_description": "tacotron2 with double decoder consistency.",
+    "batch_size": 64,
+    "eval_batch_size": 16,
+    "mixed_precision": true,
+    "loss_masking": true,
+    "decoder_loss_alpha": 0.25,
+    "postnet_loss_alpha": 0.25,
+    "postnet_diff_spec_alpha": 0.25,
+    "decoder_diff_spec_alpha": 0.25,
+    "decoder_ssim_alpha": 0.25,
+    "postnet_ssim_alpha": 0.25,
+    "ga_alpha": 5.0,
+    "stopnet_pos_weight": 15.0,
+    "run_eval": true,
+    "test_delay_epochs": 10,
+    "test_sentences_file": null,
+    "noam_schedule": true,
+    "grad_clip": 0.05,
+    "epochs": 1000,
+    "lr": 0.001,
+    "wd": 1e-06,
+    "warmup_steps": 4000,
+    "memory_size": -1,
+    "prenet_type": "original",
+    "prenet_dropout": true,
+    "attention_type": "original",
+    "location_attn": true,
+    "double_decoder_consistency": true,
+    "ddc_r": 6,
+    "attention_norm": "sigmoid",
+    "r": 6,
+    "gradual_training": [[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
+    "stopnet": true,
+    "separate_stopnet": true,
+    "print_step": 25,
+    "tb_plot_step": 100,
+    "print_eval": false,
+    "save_step": 10000,
+    "checkpoint": true,
+    "text_cleaner": "phoneme_cleaners",
+    "num_loader_workers": 4,
+    "num_val_loader_workers": 4,
+    "batch_group_size": 4,
+    "min_seq_len": 6,
+    "max_seq_len": 180,
+    "compute_input_seq_cache": true,
+    "output_path": "DEFINE THIS",
+    "phoneme_cache_path": "DEFINE THIS",
+    "use_phonemes": false,
+    "phoneme_language": "en-us"
+}
\ No newline at end of file
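run.sh writes scale_stats.npy with TTS/bin/compute_statistics.py, and the recipe's "stats_path" points at it, switching the audio processor to mean-variance normalization. To peek inside the file, something like the following should work, assuming it stores a pickled dict of per-feature statistics (the key layout is not spelled out by this patch, so inspect rather than hard-code):

import numpy as np

# allow_pickle is needed because the file is assumed to hold a dict,
# not a bare array.
stats = np.load("scale_stats.npy", allow_pickle=True).item()
print(sorted(stats.keys()))  # e.g. mel/linear mean and std arrays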
"audio":{ - // stft parameters - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - - // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. - "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - - // Silence trimming - "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - - // Griffin-Lim - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - - // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. - "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 1, - - // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. - "min_level_db": -100, // lower bound for normalization - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "stats_path": "./scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - - // VOCABULARY PARAMETERS - // if custom character set is not defined, - // default set in symbols.py is used - "characters":{ - "pad": "_", - "eos": "~", - "bos": "^", - "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", - "punctuations": "!'(),-.:;? ", - "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" - }, - - // DISTRIBUTED TRAINING - "distributed":{ - "backend": "nccl", - "url": "tcp:\/\/localhost:54321" - }, - - "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. - - // TRAINING - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":16, - "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. - "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. - "mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. 
-
-    // LOSS SETTINGS
-    "loss_masking": true,  // enable / disable loss masking against the sequence padding.
-    "decoder_loss_alpha": 0.5,  // original decoder loss weight. If > 0, it is enabled
-    "postnet_loss_alpha": 0.25,  // original postnet loss weight. If > 0, it is enabled
-    "postnet_diff_spec_alpha": 0.25,  // differential spectral loss weight. If > 0, it is enabled
-    "decoder_diff_spec_alpha": 0.25,  // differential spectral loss weight. If > 0, it is enabled
-    "decoder_ssim_alpha": 0.5,  // decoder ssim loss weight. If > 0, it is enabled
-    "postnet_ssim_alpha": 0.25,  // postnet ssim loss weight. If > 0, it is enabled
-    "ga_alpha": 5.0,  // weight for guided attention loss. If > 0, guided attention is enabled.
-    "stopnet_pos_weight": 15.0,  // pos class weight for stopnet loss since there are way more negative samples than positive samples.
-
-
-    // VALIDATION
-    "run_eval": true,
-    "test_delay_epochs": 10,  // Until attention is aligned, testing only wastes computation time.
-    "test_sentences_file": null,  // set a file to load sentences to be used for testing. If it is null, the default English sentences are used.
-
-    // OPTIMIZER
-    "noam_schedule": false,  // use noam warmup and lr schedule.
-    "grad_clip": 1.0,  // upper limit for gradients for clipping.
-    "epochs": 1000,  // total number of epochs to train.
-    "lr": 0.0001,  // Initial learning rate. If Noam decay is active, maximum learning rate.
-    "wd": 0.000001,  // Weight decay weight.
-    "warmup_steps": 4000,  // Noam decay steps to increase the learning rate from 0 to "lr"
-    "seq_len_norm": false,  // Normalize each sample loss by its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.
-
-    // TACOTRON PRENET
-    "memory_size": -1,  // ONLY TACOTRON - size of the memory queue used for storing the last decoder predictions for auto-regression. If < 0, the memory queue is disabled and the decoder only uses the last prediction frame.
-    "prenet_type": "original",  // "original" or "bn".
-    "prenet_dropout": true,  // enable/disable dropout at prenet.
-
-    // TACOTRON ATTENTION
-    "attention_type": "original",  // 'original', 'graves', 'dynamic_convolution'
-    "attention_heads": 4,  // number of attention heads (only for 'graves')
-    "attention_norm": "sigmoid",  // softmax or sigmoid.
-    "windowing": false,  // Enables attention windowing. Used only in eval mode.
-    "use_forward_attn": false,  // if it uses forward attention. In general, it aligns faster.
-    "forward_attn_mask": false,  // Additional masking forcing monotonicity only in eval mode.
-    "transition_agent": false,  // enable/disable transition agent of forward attention.
-    "location_attn": true,  // enable/disable location sensitive attention. It is enabled for TACOTRON by default.
-    "bidirectional_decoder": false,  // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
-    "double_decoder_consistency": true,  // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
-    "ddc_r": 7,  // reduction rate for coarse decoder.
-
-    // STOPNET
-    "stopnet": true,  // Train stopnet predicting the end of synthesis.
-    "separate_stopnet": true,  // Train stopnet separately if 'stopnet==true'. It prevents stopnet loss from influencing the rest of the model. It results in a better model, but it trains SLOWER.
-
-    // TENSORBOARD and LOGGING
-    "print_step": 25,  // Number of steps to log training on console.
- "tb_plot_step": 100, // Number of steps to plot TB training figures. - "print_eval": false, // If True, it prints intermediate loss values in evalulation. - "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_all_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - - // DATA LOADING - "text_cleaner": "basic_cleaners", - "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. - "batch_group_size": 4, //Number of batches to shuffle after bucketing. - "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training - "max_seq_len": 153, // DATASET-RELATED: maximum text length - "compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage. - "use_noise_augment": true, - - // PATHS - "output_path": "./Models/Kokoro/", - - // PHONEMES - "phoneme_cache_path": "./phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. - "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. - "phoneme_language": "ja-jp", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages - - // MULTI-SPEAKER and GST - "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "use_gst": false, // use global style tokens - "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "gst": { // gst parameter if gst is enabled - "gst_style_input": null, // Condition the style input either on a - // -> wave file [path to wave] or - // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} - // with the dictionary being len(dict) <= len(gst_style_tokens). - "gst_embedding_dim": 512, - "gst_num_heads": 4, - "gst_style_tokens": 10, - "gst_use_speaker_embedding": false - }, - - // DATASETS - "datasets": // List of datasets. They all merged and they get different speaker_ids. 
-        [
-            {
-                "name": "kokoro",
-                "path": "./kokoro-speech-v1_1-small/",
-                "meta_file_train": "metadata.csv",  // for VCTK, if a list, ignore the speaker ids in the list for training; useful for testing cloning with new speakers
-                "meta_file_val": null
-            }
-        ]
-}
diff --git a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
index 9cdbbd3b..1aaec547 100644
--- a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
+++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
@@ -1,8 +1,8 @@
 {
     "datasets": [
         {
-            "name": "ljspeech",
-            "path": "DEFINE THIS",
+            "name": "kokoro",
+            "path": "./kokoro-speech-v1_1-tiny/",
             "meta_file_train": "metadata.csv",
             "meta_file_val": null
         }
@@ -32,44 +32,61 @@
         "stats_path": "scale_stats.npy"
     },
     "gst":{
-        "gst_embedding_dim": 256,
+        "gst_style_input": null,
+
+
+
+        "gst_embedding_dim": 512,
         "gst_num_heads": 4,
-        "gst_num_style_tokens": 10
-    },
+        "gst_style_tokens": 10,
+        "gst_use_speaker_embedding": false
+    },
     "model": "Tacotron2",
-    "run_name": "ljspeech-ddc",
-    "run_description": "tacotron2 with double decoder consistency.",
-    "batch_size": 64,
+    "run_name": "kokoro-ddc",
+    "run_description": "tacotron2 with DDC and differential spectral loss.",
+    "batch_size": 32,
     "eval_batch_size": 16,
     "mixed_precision": true,
+    "distributed": {
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54321"
+    },
+    "reinit_layers": [],
     "loss_masking": true,
-    "decoder_loss_alpha": 0.25,
+    "decoder_loss_alpha": 0.5,
     "postnet_loss_alpha": 0.25,
     "postnet_diff_spec_alpha": 0.25,
     "decoder_diff_spec_alpha": 0.25,
-    "decoder_ssim_alpha": 0.25,
+    "decoder_ssim_alpha": 0.5,
     "postnet_ssim_alpha": 0.25,
     "ga_alpha": 5.0,
     "stopnet_pos_weight": 15.0,
     "run_eval": true,
     "test_delay_epochs": 10,
    "test_sentences_file": null,
-    "noam_schedule": true,
-    "grad_clip": 0.05,
+    "noam_schedule": false,
+    "grad_clip": 1.0,
     "epochs": 1000,
-    "lr": 0.001,
-    "wd": 1e-06,
+    "lr": 0.0001,
+    "wd": 0.000001,
     "warmup_steps": 4000,
+    "seq_len_norm": false,
     "memory_size": -1,
     "prenet_type": "original",
     "prenet_dropout": true,
     "attention_type": "original",
+    "windowing": false,
+    "use_forward_attn": false,
+    "forward_attn_mask": false,
+    "transition_agent": false,
     "location_attn": true,
+    "bidirectional_decoder": false,
     "double_decoder_consistency": true,
-    "ddc_r": 6,
+    "ddc_r": 7,
+    "attention_heads": 4,
     "attention_norm": "sigmoid",
-    "r": 6,
-    "gradual_training": [[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
+    "r": 7,
+    "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]],
     "stopnet": true,
     "separate_stopnet": true,
     "print_step": 25,
@@ -77,15 +94,32 @@
     "print_eval": false,
     "save_step": 10000,
     "checkpoint": true,
-    "text_cleaner": "phoneme_cleaners",
+    "keep_all_best": false,
+    "keep_after": 10000,
+    "tb_model_param_stats": false,
+    "text_cleaner": "basic_cleaners",
+    "enable_eos_bos_chars": false,
     "num_loader_workers": 4,
     "num_val_loader_workers": 4,
     "batch_group_size": 4,
     "min_seq_len": 6,
-    "max_seq_len": 180,
-    "compute_input_seq_cache": true,
+    "max_seq_len": 153,
+    "compute_input_seq_cache": false,
+    "use_noise_augment": true,
-    "output_path": "DEFINE THIS",
-    "phoneme_cache_path": "DEFINE THIS",
-    "use_phonemes": false,
-    "phoneme_language": "en-us"
+    "output_path": "./Models/Kokoro/",
+    "phoneme_cache_path": "./phoneme_cache/",
+    "use_phonemes": true,
+    "phoneme_language": "ja-jp",
+    "characters": {
+        "pad": "_",
+        "eos": "~",
+        "bos": "^",
+        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
", + "punctuations": "!'(),-.:;? ", + "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + }, + "use_speaker_embedding": false, + "use_gst": false, + "use_external_speaker_embedding_file": false, + "external_speaker_embedding_file": "../../speakers-vctk-en.json" } \ No newline at end of file From 88f3255962073d84d1c7d559b956a0330a6fd11d Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 29 May 2021 19:39:51 +0900 Subject: [PATCH 07/10] Update Kokoro recipe --- recipes/kokoro/tacotron2-DDC/run.sh | 27 ++++++++++--------- .../kokoro/tacotron2-DDC/tacotron2-DDC.json | 6 ++--- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh index eaa05b60..cd2aaff5 100644 --- a/recipes/kokoro/tacotron2-DDC/run.sh +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -1,22 +1,23 @@ #!/bin/bash # take the scripts's parent's directory to prefix all the output paths. RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +CORPUS=kokoro-speech-v1_1-tiny echo $RUN_DIR -# download LJSpeech dataset -wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 -# extract -tar -xjf LJSpeech-1.1.tar.bz2 +if [ \! -d $RUN_DIR/$CORPUS ] ; then + echo "$RUN_DIR/$CORPUS doesn't exist." + echo "Follow the instruction of https://github.com/kaiidams/Kokoro-Speech-Dataset to make the corpus." + exit 1 +fi # create train-val splits -shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv -head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv -tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv -mv LJSpeech-1.1 $RUN_DIR/ -rm LJSpeech-1.1.tar.bz2 +shuf $RUN_DIR/$CORPUS/metadata.csv > $RUN_DIR/$CORPUS/metadata_shuf.csv +head -n 8000 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_train.csv +tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.csv # compute dataset mean and variance for normalization -python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/ +python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/ # training .... 
From 88f3255962073d84d1c7d559b956a0330a6fd11d Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Sat, 29 May 2021 19:39:51 +0900
Subject: [PATCH 07/10] Update Kokoro recipe

---
 recipes/kokoro/tacotron2-DDC/run.sh         | 27 +++++++++++----------
 .../kokoro/tacotron2-DDC/tacotron2-DDC.json |  6 ++---
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh
index eaa05b60..cd2aaff5 100644
--- a/recipes/kokoro/tacotron2-DDC/run.sh
+++ b/recipes/kokoro/tacotron2-DDC/run.sh
@@ -1,22 +1,23 @@
 #!/bin/bash
 # take the script's parent directory to prefix all the output paths.
 RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+CORPUS=kokoro-speech-v1_1-tiny
 echo $RUN_DIR
-# download LJSpeech dataset
-wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
-# extract
-tar -xjf LJSpeech-1.1.tar.bz2
+if [ \! -d $RUN_DIR/$CORPUS ] ; then
+    echo "$RUN_DIR/$CORPUS doesn't exist."
+    echo "Follow the instructions at https://github.com/kaiidams/Kokoro-Speech-Dataset to build the corpus."
+    exit 1
+fi
 # create train-val splits
-shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
-head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
-tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
-mv LJSpeech-1.1 $RUN_DIR/
-rm LJSpeech-1.1.tar.bz2
+shuf $RUN_DIR/$CORPUS/metadata.csv > $RUN_DIR/$CORPUS/metadata_shuf.csv
+head -n 8000 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_train.csv
+tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.csv
 # compute dataset mean and variance for normalization
-python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/
+python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/
 # training ....
 # change the GPU id if needed
 CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \
-    --coqpit.output_path $RUN_DIR \
-    --coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \
-    --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \
\ No newline at end of file
+    --coqpit.output_path $RUN_DIR \
+    --coqpit.datasets.0.path $RUN_DIR/$CORPUS \
+    --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \
+    --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \
\ No newline at end of file
diff --git a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
index 1aaec547..b3630055 100644
--- a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
+++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
@@ -2,7 +2,7 @@
     "datasets": [
         {
             "name": "kokoro",
-            "path": "./kokoro-speech-v1_1-tiny/",
+            "path": "DEFINE THIS",
             "meta_file_train": "metadata.csv",
             "meta_file_val": null
         }
@@ -106,8 +106,8 @@
     "max_seq_len": 153,
     "compute_input_seq_cache": false,
     "use_noise_augment": true,
-    "output_path": "./Models/Kokoro/",
-    "phoneme_cache_path": "./phoneme_cache/",
+    "output_path": "DEFINE THIS",
+    "phoneme_cache_path": "DEFINE THIS",
     "use_phonemes": true,
     "phoneme_language": "ja-jp",

From 2091e808c82647787b571f1b17e80378d203e830 Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Sat, 29 May 2021 19:41:00 +0900
Subject: [PATCH 08/10] Fix path

---
 recipes/kokoro/tacotron2-DDC/run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh
index cd2aaff5..86fda642 100644
--- a/recipes/kokoro/tacotron2-DDC/run.sh
+++ b/recipes/kokoro/tacotron2-DDC/run.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # take the script's parent directory to prefix all the output paths.
 RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
-CORPUS=kokoro-speech-v1_1-tiny
+CORPUS=kokoro-speech-v1_1-small
 echo $RUN_DIR
 if [ \! -d $RUN_DIR/$CORPUS ] ; then
     echo "$RUN_DIR/$CORPUS doesn't exist."

From 1cc18d19729545c83e2a7482b949f896fd714ef4 Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Tue, 1 Jun 2021 18:51:34 +0900
Subject: [PATCH 09/10] Move unittest of Japanese phonemizer.

---
 .../tts_tests/test_japanese_phonemizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename TTS/tts/utils/text/japanese/phonemizer_test.py => tests/tts_tests/test_japanese_phonemizer.py (89%)

diff --git a/TTS/tts/utils/text/japanese/phonemizer_test.py b/tests/tts_tests/test_japanese_phonemizer.py
similarity index 89%
rename from TTS/tts/utils/text/japanese/phonemizer_test.py
rename to tests/tts_tests/test_japanese_phonemizer.py
index f07c0901..437042f0 100644
--- a/TTS/tts/utils/text/japanese/phonemizer_test.py
+++ b/tests/tts_tests/test_japanese_phonemizer.py
@@ -1,5 +1,5 @@
 import unittest
-from .phonemizer import japanese_text_to_phonemes
+from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
 
 _TEST_CASES = '''
 どちらに行きますか?/dochiraniikimasuka?

From 6d8310d2a99de22e3537321acbf48f9b35b00b14 Mon Sep 17 00:00:00 2001
From: Katsuya Iida
Date: Wed, 2 Jun 2021 07:48:28 +0900
Subject: [PATCH 10/10] Set the version to the same as the dev branch.

---
 TTS/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/_version.py b/TTS/_version.py
index f4956698..311f216e 100644
--- a/TTS/_version.py
+++ b/TTS/_version.py
@@ -1 +1 @@
-__version__ = "0.0.14.1"
+__version__ = "0.0.14"
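With the whole series applied, Japanese goes through the generic front end: text2phone() dispatches "ja-jp" to japanese_text_to_phonemes(). A minimal smoke test, with the expected string taken from tests/tts_tests/test_japanese_phonemizer.py above:

from TTS.tts.utils.text import text2phone

# "ja-jp" routes to the Japanese phonemizer added by this series.
assert text2phone('今日は温泉に、行きます。', 'ja-jp') == 'kyo:waoNseNni,ikimasu.'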