mirror of https://github.com/coqui-ai/TTS.git
visual updates for phonemes
parent a60c9ee47d
commit caae1af4f6
@@ -182,4 +182,5 @@ def sequence_mask(sequence_length, max_len=None):
         seq_range_expand = seq_range_expand.cuda()
     seq_length_expand = (sequence_length.unsqueeze(1)
                          .expand_as(seq_range_expand))
+    # B x T_max
     return seq_range_expand < seq_length_expand
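For reference, a minimal standalone sketch of what this mask computes: a B x T_max boolean tensor that is True wherever a timestep falls inside the corresponding sequence. This is an illustrative re-implementation for clarity, not the patched file itself.

import torch

def sequence_mask_sketch(sequence_length, max_len=None):
    # B x T_max boolean mask: True where timestep t < length of sequence b.
    if max_len is None:
        max_len = int(sequence_length.max())
    seq_range = torch.arange(max_len, device=sequence_length.device)
    return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1)

# lengths [2, 4] -> [[True, True, False, False],
#                    [True, True, True,  True ]]
print(sequence_mask_sketch(torch.tensor([2, 4])))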
@@ -3,7 +3,7 @@ import time
 import librosa
 import torch
 import numpy as np
-from .text import text_to_sequence, phoneme_to_sequence
+from .text import text_to_sequence, phoneme_to_sequence, sequence_to_phoneme
 from .visual import visualize
 from matplotlib import pylab as plt

@@ -11,8 +11,6 @@ from matplotlib import pylab as plt
 def synthesis(m, s, CONFIG, use_cuda, ap):
     """ Given the text, synthesising the audio """
     text_cleaner = [CONFIG.text_cleaner]
-    # print(phoneme_to_sequence(s, text_cleaner))s
-    # print(sequence_to_phoneme(phoneme_to_sequence(s, text_cleaner)))
     if CONFIG.use_phonemes:
         seq = np.asarray(
             phoneme_to_sequence(s, text_cleaner, CONFIG.phoneme_language),
@@ -52,7 +52,7 @@ def phoneme_to_sequence(text, cleaner_names, language):
     for phoneme in phonemes.split('|'):
         # print(word, ' -- ', phonemes_text)
         sequence += _phoneme_to_sequence(phoneme)
-    # Aeepnd EOS char
+    # Append EOS char
     sequence.append(_phonemes_to_id['~'])
     return sequence

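For orientation, a hedged usage sketch of the round trip these helpers provide (the removed debug prints above were checking exactly this). The cleaner name and language code below are placeholders for illustration, not values taken from the patch.

# Assumes the signatures shown in this diff:
#   phoneme_to_sequence(text, cleaner_names, language) -> list of int IDs (EOS '~' appended)
#   sequence_to_phoneme(sequence) -> phoneme string
from utils.text import phoneme_to_sequence, sequence_to_phoneme

cleaners = ["english_cleaners"]                               # placeholder cleaner name
seq = phoneme_to_sequence("hello world", cleaners, "en-us")   # "en-us" is a placeholder language code
print(seq)                        # list of phoneme IDs
print(sequence_to_phoneme(seq))   # phoneme string reconstructed from the IDs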
@@ -3,6 +3,7 @@ import librosa
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
+from utils.text import phoneme_to_sequence, sequence_to_phoneme


 def plot_alignment(alignment, info=None):
@@ -29,19 +30,22 @@ def plot_spectrogram(linear_output, audio):
     return fig


-def visualize(alignment, spectrogram, stop_tokens, text, hop_length, CONFIG, spectrogram2=None):
-    if spectrogram2 is not None:
+def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None):
+    if spectrogram is not None:
         num_plot = 4
     else:
         num_plot = 3

     label_fontsize = 16
-    plt.figure(figsize=(16, 32))
+    plt.figure(figsize=(16, 48))

     plt.subplot(num_plot, 1, 1)
     plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
     plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
     plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
+    if CONFIG.use_phonemes:
+        seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language)
+        text = sequence_to_phoneme(seq)
     plt.yticks(range(len(text)), list(text))
     plt.colorbar()

@@ -50,17 +54,18 @@ def visualize(alignment, spectrogram, stop_tokens, text, hop_length, CONFIG, spe
     plt.plot(range(len(stop_tokens)), list(stop_tokens))

     plt.subplot(num_plot, 1, 3)
-    librosa.display.specshow(spectrogram.T, sr=CONFIG.audio['sample_rate'],
+    librosa.display.specshow(spectrogram_postnet.T, sr=CONFIG.audio['sample_rate'],
                              hop_length=hop_length, x_axis="time", y_axis="linear")
     plt.xlabel("Time", fontsize=label_fontsize)
     plt.ylabel("Hz", fontsize=label_fontsize)
+    plt.tight_layout()
+    plt.colorbar()

-    if spectrogram2 is not None:
+    if spectrogram is not None:
         plt.subplot(num_plot, 1, 4)
-        librosa.display.specshow(spectrogram2.T, sr=CONFIG.audio['sample_rate'],
+        librosa.display.specshow(spectrogram.T, sr=CONFIG.audio['sample_rate'],
                                  hop_length=hop_length, x_axis="time", y_axis="linear")
         plt.xlabel("Time", fontsize=label_fontsize)
         plt.ylabel("Hz", fontsize=label_fontsize)
-
-    plt.tight_layout()
-    plt.colorbar()
+        plt.tight_layout()
+        plt.colorbar()
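To tie the pieces together, a hedged sketch of how the renamed visualize signature might be driven from a synthesis result. The output names and CONFIG fields used here are assumptions for illustration, not code from this commit.

# Hypothetical call site; assumes visualize(alignment, spectrogram_postnet, stop_tokens,
# text, hop_length, CONFIG, spectrogram=None) as defined in this diff.
from utils.visual import visualize

def plot_synthesis_result(outputs, text, CONFIG):
    # `outputs` is an assumed dict of numpy arrays produced by a synthesis step.
    visualize(
        alignment=outputs["alignment"],          # decoder-by-encoder attention weights (subplot 1)
        spectrogram_postnet=outputs["postnet"],  # postnet spectrogram, plotted in subplot 3
        stop_tokens=outputs["stop_tokens"],      # per-frame stop probabilities (subplot 2)
        text=text,                               # converted to phonemes inside if CONFIG.use_phonemes
        hop_length=CONFIG.audio["hop_length"],   # assumes hop_length lives in CONFIG.audio
        CONFIG=CONFIG,
        spectrogram=outputs.get("decoder"),      # optional pre-postnet spectrogram -> subplot 4
    )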