mirror of https://github.com/coqui-ai/TTS.git
cleaners update for special chars and config update
parent 79a3684370
commit f707460886
@@ -39,7 +39,7 @@
     "batch_size": 2,        // Batch size for training. Batch sizes lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
     "eval_batch_size": 16,
     "r": 7,                 // Number of decoder frames to predict per iteration. Set the initial value if gradual training is enabled.
-    "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
+    "gradual_training": [[0, 7, 64], [2000, 5, 64], [35000, 3, 32], [70000, 2, 32], [140000, 1, 32]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
     "loss_masking": true,   // enable / disable loss masking against the sequence padding.
     "grad_accum": 2,        // if N > 1, enable gradient accumulation for N iterations. It is useful for low memory GPUs.
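The 'gradual_training' entries are [first_step, r, batch_size] triplets: once training passes first_step, the loop switches to that r and batch size. A minimal sketch of how such a schedule could be looked up, using the new values from this commit; pick_gradual_params is a hypothetical helper, not the trainer's actual API:

# Sketch (hypothetical helper, not the trainer's API): pick the active
# [first_step, r, batch_size] entry from "gradual_training" for a global step.
def pick_gradual_params(schedule, global_step):
    r, batch_size = schedule[0][1], schedule[0][2]
    for first_step, new_r, new_bs in schedule:
        if global_step >= first_step:
            r, batch_size = new_r, new_bs
    return r, batch_size

schedule = [[0, 7, 64], [2000, 5, 64], [35000, 3, 32], [70000, 2, 32], [140000, 1, 32]]
print(pick_gradual_params(schedule, 50000))  # -> (3, 32) under the new schedule above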
@@ -93,10 +93,10 @@
     "max_seq_len": 150,     // DATASET-RELATED: maximum text length

     // PATHS
-    "output_path": "/data4/rw/home/Trainings/",
+    "output_path": "/home/erogol/Models/",

     // PHONEMES
-    "phoneme_cache_path": "mozilla_us_phonemes",     // phoneme computation is slow, so results are cached in the given folder.
+    "phoneme_cache_path": "mozilla_us_phonemes_2_1", // phoneme computation is slow, so results are cached in the given folder.
     "use_phonemes": true,        // use phonemes instead of raw characters. It is suggested for better pronunciation.
     "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
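The phoneme cache path above is where precomputed phoneme sequences are stored so they are not recomputed every epoch. A rough sketch of that idea with the phonemizer package linked in the config comment; the cached_phonemes helper and per-utterance pickle files are assumptions for illustration, not the trainer's exact behaviour:

# Illustrative phoneme caching (assumed helper and file layout, not the trainer's
# exact behaviour): compute phonemes once per utterance and reuse them afterwards.
import os
import pickle

from phonemizer import phonemize

def cached_phonemes(text, utt_id, cache_dir="mozilla_us_phonemes_2_1", language="en-us"):
    os.makedirs(cache_dir, exist_ok=True)
    cache_file = os.path.join(cache_dir, utt_id + ".pkl")
    if os.path.isfile(cache_file):
        with open(cache_file, "rb") as f:   # cache hit: skip slow phonemization
            return pickle.load(f)
    phonemes = phonemize(text, language=language, backend="espeak")
    with open(cache_file, "wb") as f:       # cache miss: store for the next run
        pickle.dump(phonemes, f)
    return phonemes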
@@ -110,7 +110,7 @@
         [
             {
                 "name": "ljspeech",
-                "path": "/root/LJSpeech-1.1/",
+                "path": "/home/erogol/Data/LJSpeech-1.1/",
                 "meta_file_train": "metadata.csv",
                 "meta_file_val": null
             }
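For reference, LJSpeech's metadata.csv (the meta_file_train above) is pipe-separated, one line per clip: file id, raw text, normalized text, with audio under wavs/. A small loader sketch under that assumption; load_ljspeech is an illustrative helper, not the repo's dataset formatter:

# Illustrative loader for the LJSpeech layout referenced above: metadata.csv rows
# are "file_id|raw_text|normalized_text" and audio lives in the wavs/ folder.
import csv
import os

def load_ljspeech(root="/home/erogol/Data/LJSpeech-1.1/", meta_file="metadata.csv"):
    items = []
    with open(os.path.join(root, meta_file), encoding="utf-8") as f:
        for row in csv.reader(f, delimiter="|", quoting=csv.QUOTE_NONE):
            wav_path = os.path.join(root, "wavs", row[0] + ".wav")
            text = row[2] if len(row) > 2 else row[1]  # prefer the normalized text
            items.append((text, wav_path))
    return items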
@@ -85,7 +85,10 @@
     "    if use_cuda:\n",
     "        waveform = waveform.cpu()\n",
     "    waveform = waveform.numpy()\n",
-    "    print(\" > Run-time: {}\".format(time.time() - t_1))\n",
+    "    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
+    "    print(waveform.shape)\n",
+    "    print(\" > Run-time: {}\".format(time.time() - t_1))\n",
+    "    print(\" > Real-time factor: {}\".format(rtf))\n",
     "    if figures:\n",
     "        visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec))\n",
     "    IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=False))\n",
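The added notebook lines report a real-time factor: synthesis time divided by the duration of the generated audio. The same calculation in isolation, with made-up timing numbers:

# Real-time factor as in the added notebook lines: synthesis time divided by
# audio duration. The waveform and timing below are stand-ins, not real output.
import numpy as np

sample_rate = 22050                   # LJSpeech audio is 22.05 kHz
waveform = np.zeros(5 * sample_rate)  # pretend 5 s of synthesized audio
run_time = 1.25                       # pretend synthesis took 1.25 s

rtf = run_time / (len(waveform) / sample_rate)
print(" > Run-time: {}".format(run_time))
print(" > Real-time factor: {}".format(rtf))  # 0.25 -> ~4x faster than real time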