diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index c673c963..19fb25be 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -13,6 +13,8 @@ from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import compute_energy as calculate_energy +import mutagen + # to prevent too many open files error as suggested here # https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936 torch.multiprocessing.set_sharing_strategy("file_system") @@ -42,6 +44,15 @@ def string2filename(string): return filename +def get_audio_size(audiopath): + extension = audiopath.rpartition(".")[-1].lower() + if extension not in {"mp3", "wav", "flac"}: + raise RuntimeError(f"The audio format {extension} is not supported, please convert the audio files to mp3, flac, or wav format!") + + audio_info = mutagen.File(audiopath).info + return int(audio_info.length * audio_info.sample_rate) + + class TTSDataset(Dataset): def __init__( self, @@ -176,7 +187,7 @@ class TTSDataset(Dataset): lens = [] for item in self.samples: _, wav_file, *_ = _parse_sample(item) - audio_len = os.path.getsize(wav_file) / 16 * 8 # assuming 16bit audio + audio_len = get_audio_size(wav_file) lens.append(audio_len) return lens @@ -295,7 +306,7 @@ class TTSDataset(Dataset): def _compute_lengths(samples): new_samples = [] for item in samples: - audio_length = os.path.getsize(item["audio_file"]) / 16 * 8 # assuming 16bit audio + audio_length = get_audio_size(item["audio_file"]) text_lenght = len(item["text"]) item["audio_length"] = audio_length item["text_length"] = text_lenght diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 83812f37..8e9d6bd3 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -756,11 +756,13 @@ class Xtts(BaseTTS): model_path = checkpoint_path or os.path.join(checkpoint_dir, "model.pth") vocab_path = vocab_path or os.path.join(checkpoint_dir, "vocab.json") - speaker_file_path = speaker_file_path or os.path.join(checkpoint_dir, "speakers_xtts.pth") + + if speaker_file_path is None and checkpoint_dir is not None: + speaker_file_path = os.path.join(checkpoint_dir, "speakers_xtts.pth") self.language_manager = LanguageManager(config) self.speaker_manager = None - if os.path.exists(speaker_file_path): + if speaker_file_path is not None and os.path.exists(speaker_file_path): self.speaker_manager = SpeakerManager(speaker_file_path) if os.path.exists(vocab_path): diff --git a/requirements.txt b/requirements.txt index 23e8d2d0..2944e6fa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,6 +17,7 @@ pyyaml>=6.0 fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail aiohttp>=3.8.1 packaging>=23.1 +mutagen==1.47.0 # deps for examples flask>=2.0.1 # deps for inference diff --git a/tests/data/ljspeech/metadata_flac.csv b/tests/data/ljspeech/metadata_flac.csv new file mode 100644 index 00000000..43db05ac --- /dev/null +++ b/tests/data/ljspeech/metadata_flac.csv @@ -0,0 +1,9 @@ +audio_file|text|transcription|speaker_name +wavs/LJ001-0001.flac|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0 +wavs/LJ001-0002.flac|in being comparatively modern.|in being comparatively modern.|ljspeech-0 +wavs/LJ001-0003.flac|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1 +wavs/LJ001-0004.flac|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1 +wavs/LJ001-0005.flac|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2 +wavs/LJ001-0006.flac|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2 +wavs/LJ001-0007.flac|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3 +wavs/LJ001-0008.flac|has never been surpassed.|has never been surpassed.|ljspeech-3 \ No newline at end of file diff --git a/tests/data/ljspeech/metadata_mp3.csv b/tests/data/ljspeech/metadata_mp3.csv new file mode 100644 index 00000000..109e48b4 --- /dev/null +++ b/tests/data/ljspeech/metadata_mp3.csv @@ -0,0 +1,9 @@ +audio_file|text|transcription|speaker_name +wavs/LJ001-0001.mp3|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0 +wavs/LJ001-0002.mp3|in being comparatively modern.|in being comparatively modern.|ljspeech-0 +wavs/LJ001-0003.mp3|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1 +wavs/LJ001-0004.mp3|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1 +wavs/LJ001-0005.mp3|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2 +wavs/LJ001-0006.mp3|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2 +wavs/LJ001-0007.mp3|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3 +wavs/LJ001-0008.mp3|has never been surpassed.|has never been surpassed.|ljspeech-3 \ No newline at end of file diff --git a/tests/data/ljspeech/metadata_wav.csv b/tests/data/ljspeech/metadata_wav.csv new file mode 100644 index 00000000..aff73f6d --- /dev/null +++ b/tests/data/ljspeech/metadata_wav.csv @@ -0,0 +1,9 @@ +audio_file|text|transcription|speaker_name +wavs/LJ001-0001.wav|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0 +wavs/LJ001-0002.wav|in being comparatively modern.|in being comparatively modern.|ljspeech-0 +wavs/LJ001-0003.wav|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1 +wavs/LJ001-0004.wav|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1 +wavs/LJ001-0005.wav|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2 +wavs/LJ001-0006.wav|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2 +wavs/LJ001-0007.wav|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3 +wavs/LJ001-0008.wav|has never been surpassed.|has never been surpassed.|ljspeech-3 \ No newline at end of file diff --git a/tests/data/ljspeech/wavs/LJ001-0001.flac b/tests/data/ljspeech/wavs/LJ001-0001.flac new file mode 100644 index 00000000..ed3b009d Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0001.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0001.mp3 b/tests/data/ljspeech/wavs/LJ001-0001.mp3 new file mode 100644 index 00000000..da62c8d7 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0001.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0002.flac b/tests/data/ljspeech/wavs/LJ001-0002.flac new file mode 100644 index 00000000..f6a607ea Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0002.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0002.mp3 b/tests/data/ljspeech/wavs/LJ001-0002.mp3 new file mode 100644 index 00000000..8eb52792 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0002.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0003.flac b/tests/data/ljspeech/wavs/LJ001-0003.flac new file mode 100644 index 00000000..05f357a5 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0003.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0003.mp3 b/tests/data/ljspeech/wavs/LJ001-0003.mp3 new file mode 100644 index 00000000..5bc44498 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0003.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0004.flac b/tests/data/ljspeech/wavs/LJ001-0004.flac new file mode 100644 index 00000000..547e7899 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0004.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0004.mp3 b/tests/data/ljspeech/wavs/LJ001-0004.mp3 new file mode 100644 index 00000000..c68a1680 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0004.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0005.flac b/tests/data/ljspeech/wavs/LJ001-0005.flac new file mode 100644 index 00000000..94589dbb Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0005.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0005.mp3 b/tests/data/ljspeech/wavs/LJ001-0005.mp3 new file mode 100644 index 00000000..99c245b0 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0005.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0006.flac b/tests/data/ljspeech/wavs/LJ001-0006.flac new file mode 100644 index 00000000..87d32d33 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0006.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0006.mp3 b/tests/data/ljspeech/wavs/LJ001-0006.mp3 new file mode 100644 index 00000000..bc6cb81f Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0006.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0007.flac b/tests/data/ljspeech/wavs/LJ001-0007.flac new file mode 100644 index 00000000..7e2b0f1d Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0007.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0007.mp3 b/tests/data/ljspeech/wavs/LJ001-0007.mp3 new file mode 100644 index 00000000..f1e34d1b Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0007.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0008.flac b/tests/data/ljspeech/wavs/LJ001-0008.flac new file mode 100644 index 00000000..6ca201a6 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0008.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0008.mp3 b/tests/data/ljspeech/wavs/LJ001-0008.mp3 new file mode 100644 index 00000000..ede2f068 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0008.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0009.flac b/tests/data/ljspeech/wavs/LJ001-0009.flac new file mode 100644 index 00000000..cd272b5f Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0009.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0009.mp3 b/tests/data/ljspeech/wavs/LJ001-0009.mp3 new file mode 100644 index 00000000..1dd97c48 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0009.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0010.flac b/tests/data/ljspeech/wavs/LJ001-0010.flac new file mode 100644 index 00000000..875e01b0 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0010.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0010.mp3 b/tests/data/ljspeech/wavs/LJ001-0010.mp3 new file mode 100644 index 00000000..a763be3c Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0010.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0011.flac b/tests/data/ljspeech/wavs/LJ001-0011.flac new file mode 100644 index 00000000..3a45005a Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0011.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0011.mp3 b/tests/data/ljspeech/wavs/LJ001-0011.mp3 new file mode 100644 index 00000000..579854e1 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0011.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0012.flac b/tests/data/ljspeech/wavs/LJ001-0012.flac new file mode 100644 index 00000000..2f78f762 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0012.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0012.mp3 b/tests/data/ljspeech/wavs/LJ001-0012.mp3 new file mode 100644 index 00000000..51212f90 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0012.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0013.flac b/tests/data/ljspeech/wavs/LJ001-0013.flac new file mode 100644 index 00000000..50c7707f Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0013.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0013.mp3 b/tests/data/ljspeech/wavs/LJ001-0013.mp3 new file mode 100644 index 00000000..a457bf9c Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0013.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0014.flac b/tests/data/ljspeech/wavs/LJ001-0014.flac new file mode 100644 index 00000000..f8a5fe88 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0014.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0014.mp3 b/tests/data/ljspeech/wavs/LJ001-0014.mp3 new file mode 100644 index 00000000..f4a3d66e Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0014.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0015.flac b/tests/data/ljspeech/wavs/LJ001-0015.flac new file mode 100644 index 00000000..99523288 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0015.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0015.mp3 b/tests/data/ljspeech/wavs/LJ001-0015.mp3 new file mode 100644 index 00000000..f0db88e1 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0015.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0016.flac b/tests/data/ljspeech/wavs/LJ001-0016.flac new file mode 100644 index 00000000..66b7ca95 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0016.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0016.mp3 b/tests/data/ljspeech/wavs/LJ001-0016.mp3 new file mode 100644 index 00000000..cd14b204 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0016.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0017.flac b/tests/data/ljspeech/wavs/LJ001-0017.flac new file mode 100644 index 00000000..56725cce Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0017.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0017.mp3 b/tests/data/ljspeech/wavs/LJ001-0017.mp3 new file mode 100644 index 00000000..ecc9b2a3 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0017.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0018.flac b/tests/data/ljspeech/wavs/LJ001-0018.flac new file mode 100644 index 00000000..ec038cac Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0018.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0018.mp3 b/tests/data/ljspeech/wavs/LJ001-0018.mp3 new file mode 100644 index 00000000..33aa8ba1 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0018.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0019.flac b/tests/data/ljspeech/wavs/LJ001-0019.flac new file mode 100644 index 00000000..6245cc5a Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0019.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0019.mp3 b/tests/data/ljspeech/wavs/LJ001-0019.mp3 new file mode 100644 index 00000000..e1844dce Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0019.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0020.flac b/tests/data/ljspeech/wavs/LJ001-0020.flac new file mode 100644 index 00000000..41598a10 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0020.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0020.mp3 b/tests/data/ljspeech/wavs/LJ001-0020.mp3 new file mode 100644 index 00000000..7a61c050 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0020.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0021.flac b/tests/data/ljspeech/wavs/LJ001-0021.flac new file mode 100644 index 00000000..3ec0eeb3 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0021.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0021.mp3 b/tests/data/ljspeech/wavs/LJ001-0021.mp3 new file mode 100644 index 00000000..45a6d4ce Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0021.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0022.flac b/tests/data/ljspeech/wavs/LJ001-0022.flac new file mode 100644 index 00000000..9db1c6cf Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0022.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0022.mp3 b/tests/data/ljspeech/wavs/LJ001-0022.mp3 new file mode 100644 index 00000000..a0464aa2 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0022.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0023.flac b/tests/data/ljspeech/wavs/LJ001-0023.flac new file mode 100644 index 00000000..621ba660 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0023.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0023.mp3 b/tests/data/ljspeech/wavs/LJ001-0023.mp3 new file mode 100644 index 00000000..a6b087f8 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0023.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0024.flac b/tests/data/ljspeech/wavs/LJ001-0024.flac new file mode 100644 index 00000000..4125d10b Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0024.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0024.mp3 b/tests/data/ljspeech/wavs/LJ001-0024.mp3 new file mode 100644 index 00000000..0fee298f Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0024.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0025.flac b/tests/data/ljspeech/wavs/LJ001-0025.flac new file mode 100644 index 00000000..ee0c4b6e Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0025.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0025.mp3 b/tests/data/ljspeech/wavs/LJ001-0025.mp3 new file mode 100644 index 00000000..f8c13a10 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0025.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0026.flac b/tests/data/ljspeech/wavs/LJ001-0026.flac new file mode 100644 index 00000000..119f26fb Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0026.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0026.mp3 b/tests/data/ljspeech/wavs/LJ001-0026.mp3 new file mode 100644 index 00000000..fed88cc9 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0026.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0027.flac b/tests/data/ljspeech/wavs/LJ001-0027.flac new file mode 100644 index 00000000..ff685ca5 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0027.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0027.mp3 b/tests/data/ljspeech/wavs/LJ001-0027.mp3 new file mode 100644 index 00000000..bc23ed31 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0027.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0028.flac b/tests/data/ljspeech/wavs/LJ001-0028.flac new file mode 100644 index 00000000..151334f6 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0028.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0028.mp3 b/tests/data/ljspeech/wavs/LJ001-0028.mp3 new file mode 100644 index 00000000..02124033 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0028.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0029.flac b/tests/data/ljspeech/wavs/LJ001-0029.flac new file mode 100644 index 00000000..65586b6c Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0029.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0029.mp3 b/tests/data/ljspeech/wavs/LJ001-0029.mp3 new file mode 100644 index 00000000..f20eb0df Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0029.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0030.flac b/tests/data/ljspeech/wavs/LJ001-0030.flac new file mode 100644 index 00000000..411553c1 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0030.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0030.mp3 b/tests/data/ljspeech/wavs/LJ001-0030.mp3 new file mode 100644 index 00000000..7d46fbef Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0030.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0031.flac b/tests/data/ljspeech/wavs/LJ001-0031.flac new file mode 100644 index 00000000..b9f4fa68 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0031.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0031.mp3 b/tests/data/ljspeech/wavs/LJ001-0031.mp3 new file mode 100644 index 00000000..6842943c Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0031.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0032.flac b/tests/data/ljspeech/wavs/LJ001-0032.flac new file mode 100644 index 00000000..9166a9d5 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0032.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0032.mp3 b/tests/data/ljspeech/wavs/LJ001-0032.mp3 new file mode 100644 index 00000000..cf5abb64 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0032.mp3 differ diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index cbd98fc0..ce873876 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -21,15 +21,30 @@ os.makedirs(OUTPATH, exist_ok=True) c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False) c.r = 5 c.data_path = os.path.join(get_tests_data_path(), "ljspeech/") -ok_ljspeech = os.path.exists(c.data_path) -dataset_config = BaseDatasetConfig( - formatter="ljspeech_test", # ljspeech_test to multi-speaker - meta_file_train="metadata.csv", +dataset_config_wav = BaseDatasetConfig( + formatter="coqui", # ljspeech_test to multi-speaker + meta_file_train="metadata_wav.csv", meta_file_val=None, path=c.data_path, language="en", ) +dataset_config_mp3 = BaseDatasetConfig( + formatter="coqui", # ljspeech_test to multi-speaker + meta_file_train="metadata_mp3.csv", + meta_file_val=None, + path=c.data_path, + language="en", +) +dataset_config_flac = BaseDatasetConfig( + formatter="coqui", # ljspeech_test to multi-speaker + meta_file_train="metadata_flac.csv", + meta_file_val=None, + path=c.data_path, + language="en", +) + +dataset_configs = [dataset_config_wav, dataset_config_mp3, dataset_config_flac] DATA_EXIST = True if not os.path.exists(c.data_path): @@ -44,11 +59,10 @@ class TestTTSDataset(unittest.TestCase): self.max_loader_iter = 4 self.ap = AudioProcessor(**c.audio) - def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False): + def _create_dataloader(self, batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): # load dataset meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) items = meta_data_train + meta_data_eval - tokenizer, _ = TTSTokenizer.init_from_config(c) dataset = TTSDataset( outputs_per_step=r, @@ -64,6 +78,11 @@ class TestTTSDataset(unittest.TestCase): max_audio_len=c.max_audio_len, start_by_longest=start_by_longest, ) + + # add preprocess to force the length computation + if preprocess_samples: + dataset.preprocess_samples() + dataloader = DataLoader( dataset, batch_size=batch_size, @@ -75,9 +94,8 @@ class TestTTSDataset(unittest.TestCase): return dataloader, dataset def test_loader(self): - if ok_ljspeech: - dataloader, dataset = self._create_dataloader(1, 1, 0) - + for dataset_config in dataset_configs: + dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config, preprocess_samples=True) for i, data in enumerate(dataloader): if i == self.max_loader_iter: break @@ -104,8 +122,6 @@ class TestTTSDataset(unittest.TestCase): # make sure that the computed mels and the waveform match and correctly computed mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy()) - # remove padding in mel-spectrogram - mel_dataloader = mel_input[0].T.numpy()[:, : mel_lengths[0]] # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding mel_new = mel_new[:, : mel_lengths[0]] ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length) @@ -124,40 +140,38 @@ class TestTTSDataset(unittest.TestCase): self.assertGreaterEqual(mel_input.min(), 0) def test_batch_group_shuffle(self): - if ok_ljspeech: - dataloader, dataset = self._create_dataloader(2, c.r, 16) - last_length = 0 - frames = dataset.samples - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - avg_length = mel_lengths.numpy().mean() - dataloader.dataset.preprocess_samples() - is_items_reordered = False - for idx, item in enumerate(dataloader.dataset.samples): - if item != frames[idx]: - is_items_reordered = True - break - self.assertGreaterEqual(avg_length, last_length) - self.assertTrue(is_items_reordered) + dataloader, dataset = self._create_dataloader(2, c.r, 16, dataset_config_wav) + last_length = 0 + frames = dataset.samples + for i, data in enumerate(dataloader): + if i == self.max_loader_iter: + break + mel_lengths = data["mel_lengths"] + avg_length = mel_lengths.numpy().mean() + dataloader.dataset.preprocess_samples() + is_items_reordered = False + for idx, item in enumerate(dataloader.dataset.samples): + if item != frames[idx]: + is_items_reordered = True + break + self.assertGreaterEqual(avg_length, last_length) + self.assertTrue(is_items_reordered) def test_start_by_longest(self): """Test start_by_longest option. Ther first item of the fist batch must be longer than all the other items. """ - if ok_ljspeech: - dataloader, _ = self._create_dataloader(2, c.r, 0, True) - dataloader.dataset.preprocess_samples() - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - if i == 0: - max_len = mel_lengths[0] - print(mel_lengths) - self.assertTrue(all(max_len >= mel_lengths)) + dataloader, _ = self._create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True) + dataloader.dataset.preprocess_samples() + for i, data in enumerate(dataloader): + if i == self.max_loader_iter: + break + mel_lengths = data["mel_lengths"] + if i == 0: + max_len = mel_lengths[0] + print(mel_lengths) + self.assertTrue(all(max_len >= mel_lengths)) def test_padding_and_spectrograms(self): def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): @@ -172,71 +186,70 @@ class TestTTSDataset(unittest.TestCase): self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0]) self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0]) - if ok_ljspeech: - dataloader, _ = self._create_dataloader(1, 1, 0) + dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config_wav) - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] + for i, data in enumerate(dataloader): + if i == self.max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] - # check mel_spec consistency - wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32) - mel = self.ap.melspectrogram(wav).astype("float32") - mel = torch.FloatTensor(mel).contiguous() - mel_dl = mel_input[0] - # NOTE: Below needs to check == 0 but due to an unknown reason - # there is a slight difference between two matrices. - # TODO: Check this assert cond more in detail. - self.assertLess(abs(mel.T - mel_dl).max(), 1e-5) + # check mel_spec consistency + wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32) + mel = self.ap.melspectrogram(wav).astype("float32") + mel = torch.FloatTensor(mel).contiguous() + mel_dl = mel_input[0] + # NOTE: Below needs to check == 0 but due to an unknown reason + # there is a slight difference between two matrices. + # TODO: Check this assert cond more in detail. + self.assertLess(abs(mel.T - mel_dl).max(), 1e-5) - # check mel-spec correctness - mel_spec = mel_input[0].cpu().numpy() - wav = self.ap.inv_melspectrogram(mel_spec.T) - self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav") + # check mel-spec correctness + mel_spec = mel_input[0].cpu().numpy() + wav = self.ap.inv_melspectrogram(mel_spec.T) + self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav") + shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav") - # check linear-spec - linear_spec = linear_input[0].cpu().numpy() - wav = self.ap.inv_spectrogram(linear_spec.T) - self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav") + # check linear-spec + linear_spec = linear_input[0].cpu().numpy() + wav = self.ap.inv_spectrogram(linear_spec.T) + self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav") + shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav") - # check the outputs - check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) + # check the outputs + check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) - # Test for batch size 2 - dataloader, _ = self._create_dataloader(2, 1, 0) + # Test for batch size 2 + dataloader, _ = self._create_dataloader(2, 1, 0, dataset_config_wav) - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] + for i, data in enumerate(dataloader): + if i == self.max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] - # set id to the longest sequence in the batch - if mel_lengths[0] > mel_lengths[1]: - idx = 0 - else: - idx = 1 + # set id to the longest sequence in the batch + if mel_lengths[0] > mel_lengths[1]: + idx = 0 + else: + idx = 1 - # check the longer item in the batch - check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) + # check the longer item in the batch + check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) - # check the other item in the batch - self.assertEqual(linear_input[1 - idx, -1].sum(), 0) - self.assertEqual(mel_input[1 - idx, -1].sum(), 0) - self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1) - self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), stop_target.shape[1] - mel_lengths[1]) - self.assertEqual(len(mel_lengths.shape), 1) + # check the other item in the batch + self.assertEqual(linear_input[1 - idx, -1].sum(), 0) + self.assertEqual(mel_input[1 - idx, -1].sum(), 0) + self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1) + self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), stop_target.shape[1] - mel_lengths[1]) + self.assertEqual(len(mel_lengths.shape), 1) - # check batch zero-frame conditions (zero-frame disabled) - # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 - # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 + # check batch zero-frame conditions (zero-frame disabled) + # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 + # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0