Compare commits


548 Commits

Author SHA1 Message Date
Nick Potafiy dbf1a08a0d
Update generic_utils.py (#3561)
Handles cases when git branch produces no output or invalid output. Right now, it just crashes with `StopIteration`
2024-02-10 11:20:58 -03:00
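A minimal sketch of the guard this fix implies, assuming a helper named `get_git_branch` and an illustrative fallback value (not the exact patch):
```
import subprocess

def get_git_branch() -> str:
    """Return the current git branch, or a fallback when git output is missing or unusable."""
    try:
        out = subprocess.check_output(["git", "branch"], stderr=subprocess.DEVNULL).decode("utf8")
        # next() raises StopIteration if no line starts with "*", i.e. no output or invalid output
        return next(line for line in out.splitlines() if line.startswith("*")).lstrip("* ").strip()
    except (subprocess.CalledProcessError, FileNotFoundError, StopIteration):
        return "unknown"
```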
Edresson Casanova 5dcc16d193
Bug fix in MP3 and FLAC compute length on TTSDataset (#3092)
* Bug Fix on XTTS load

* Bug fix in MP3 length on TTSDataset

* Update TTS/tts/datasets/dataset.py

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Uses mutagen for all audio formats

* Add dataloader test with all supported audio formats

* Use mutagen.File

* Update

* Fix aux unit tests

* Bug fix on unit tests

---------

Co-authored-by: Aarni Koskela <akx@iki.fi>
2023-12-27 13:23:43 -03:00
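A rough sketch of the mutagen-based length check described above; the helper name and the sample-rate conversion are assumptions for illustration:
```
from mutagen import File  # mutagen probes MP3, FLAC, WAV, OGG, ... through one entry point

def audio_length_in_samples(path: str, sample_rate: int) -> int:
    """Return the clip length in samples without decoding the whole file."""
    meta = File(path)
    if meta is None or meta.info is None:
        raise ValueError(f"Unsupported or unreadable audio file: {path}")
    return int(meta.info.length * sample_rate)  # .info.length is the duration in seconds
```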
Eren Gölge 55c7063724
Merge pull request #3423 from idiap/fix-aux-tests
Fix CI (save best model after 0 steps in tests)
2023-12-14 18:00:30 +01:00
Enno Hermann 99fee6f5ad build: use Trainer>=0.0.36 2023-12-14 14:26:31 +01:00
Eren Gölge 186cafb34c
Merge pull request #3412 from coqui-ai/reuben/docs-studio-refs
Remove Coqui Studio references
2023-12-13 08:54:57 +01:00
Eren Gölge 3991d83b2c
Merge branch 'dev' into reuben/docs-studio-refs 2023-12-13 08:53:43 +01:00
Eren Gölge fa28f99f15
Update to v0.22.0 2023-12-12 16:10:46 +01:00
Eren Gölge 8c1a8b522b
Merge pull request #3405 from coqui-ai/studio_speakers
Add studio speakers to open source XTTS!
2023-12-12 16:10:09 +01:00
Reuben Morais 0859e9f252
Remove Coqui Studio references 2023-12-12 16:09:57 +01:00
Enno Hermann 9f325b1f6c fixup! Fix aux unit tests 2023-12-12 16:07:16 +01:00
Edresson Casanova fc099218df Fix aux unit tests 2023-12-12 16:07:16 +01:00
Eren Gölge 934b87bbd1
Merge pull request #3391 from aaron-lii/multi-gpu
support multiple GPU training for XTTS
2023-12-12 13:51:26 +01:00
Eren Gölge b0fe0e678d
Merge pull request #3392 from joelhoward0/fix_contributing_typo
fixes a typo
2023-12-12 13:50:59 +01:00
Eren Gölge 936084be7e
Merge pull request #3404 from freds0/dev
Training fastspeech2 with External Speaker Embeddings
2023-12-12 13:50:27 +01:00
Eren Gölge 8e6a7cbfbf
Update .models.json 2023-12-12 13:50:01 +01:00
Eren Gölge 8999780aff
Update test_models.py 2023-12-12 13:30:21 +01:00
Eren Gölge 4dc0722bbc
Update .models.json 2023-12-12 13:28:16 +01:00
Edresson Casanova 4b33699b41 Update docs 2023-12-12 09:22:07 -03:00
Edresson Casanova b6e1ac66d9 Add docs 2023-12-12 09:19:56 -03:00
WeberJulian 61b67ef16f Fix read_json_with_comments 2023-12-11 23:58:52 +01:00
WeberJulian d47b6df4e5 Make comments in .model.json valid 2023-12-11 23:35:27 +01:00
WeberJulian 605a857add Remove tortoise 2023-12-11 23:35:07 +01:00
WeberJulian b40750d1f5 Remove models that require app.coqui.ai 2023-12-11 23:17:54 +01:00
WeberJulian ecc38891fb Fix CI readme 2023-12-11 23:01:30 +01:00
WeberJulian 5ab228dff2 Fix CI 2023-12-11 22:31:53 +01:00
WeberJulian 8c20a599d8 Remove coqui studio integration from TTS 2023-12-11 22:11:46 +01:00
WeberJulian 5cd750ac7e Fix API and CI 2023-12-11 20:21:53 +01:00
WeberJulian e3c9dab7a3 Make CLI work 2023-12-11 18:49:18 +01:00
WeberJulian 0a90359a42 rename speaker file 2023-12-11 18:48:49 +01:00
WeberJulian a5c0d9780f rename manager 2023-12-11 18:48:31 +01:00
WeberJulian 36143fee26 Add basic speaker manager 2023-12-11 15:25:46 +01:00
Frederico S. Oliveira f9117918fe
Update .models.json 2023-12-11 10:47:31 -03:00
Frederico S. Oliveira 163f9a3fdf
Merge branch 'coqui-ai:dev' into dev 2023-12-11 10:04:07 -03:00
WeberJulian 0a136a8535 Download speaker file 2023-12-11 11:29:36 +01:00
joelhoward0 e535cfe07c fixes a typo 2023-12-08 14:19:57 +00:00
Aaron-Li b6e929696a support multiple GPU training 2023-12-08 16:55:32 +08:00
Eren Gölge c99e885cc8
Merge pull request #3373 from coqui-ai/add-doc-xtts
Add inference parameters
2023-12-07 14:07:28 +01:00
Eren Gölge 4b35a1e756
Merge pull request #3381 from JRMeyer/licensing-message
Print message for either commercial license or CPML
2023-12-07 13:57:39 +01:00
Josh Meyer 759d9ab3ae
Print message for either commercial license or CPML 2023-12-07 13:54:48 +01:00
Eren Gölge 6b2ba527fa
Merge pull request #3368 from omahs/patch-1
Fix typos
2023-12-06 15:10:14 +01:00
WeberJulian 7d1a6defd6 Add inference parameters 2023-12-06 11:43:31 +01:00
omahs f659fa16bc
fix typo 2023-12-05 09:50:33 +01:00
omahs 716657c835
fix typos 2023-12-05 09:48:03 +01:00
omahs 775a9138b7
fix typo 2023-12-05 09:47:07 +01:00
omahs cfb143b9fb
fix typos 2023-12-05 09:46:36 +01:00
omahs c03fe7377b
fix typos 2023-12-05 09:45:00 +01:00
omahs bba21b86c6
fix typo 2023-12-05 09:41:23 +01:00
Eren Gölge e49c512d99
Merge pull request #3351 from aaron-lii/chinese-puncs
fix pause problem of Chinese speech
2023-12-04 15:57:42 +01:00
Eren Gölge 9c7b850995
Merge pull request #3352 from VladCuciureanu/patch-1
fix: Few typos in Tortoise docs.
2023-12-04 15:56:37 +01:00
Eren Gölge 2d02015978
Update to v0.21.3 2023-12-01 23:52:57 +01:00
Edresson Casanova 5f900f156a
Add XTTS Fine tuning gradio demo (#3296)
* Add XTTS FT demo data processing pipeline

* Add training and inference columns

* Uses tabs instead of columns

* Fix demo freezing issue

* Update demo

* Convert stereo to mono

* Bug fix on XTTS inference

* Update gradio demo

* Update gradio demo

* Update gradio demo

* Update gradio demo

* Add parameters to be able to set them on the colab demo

* Add error messages

* Add intuitive error messages

* Update

* Add max_audio_length parameter

* Add XTTS fine-tuner docs

* Update XTTS finetuner docs

* Delete trainer to free memory

* Delete unused variables

* Add gc.collect()

* Update xtts.md

---------

Co-authored-by: Eren Gölge <erogol@hotmail.com>
2023-12-01 23:52:23 +01:00
Vlad Cuciureanu f5b41674e8
fix: Few typos in Tortoise docs. 2023-12-01 20:42:41 +02:00
Aaron-Li 7b8808186a fix pause problem of Chinese speech 2023-12-01 23:30:03 +08:00
Frederico S. Oliveira bcd500fa7b Fixing bug
Correction in training the Fastspeech/Fastspeech2/FastPitch/SpeedySpeech model using external speaker embedding.
2023-11-30 17:27:05 -03:00
Frederico S. Oliveira a26e51b0b4
Merge branch 'coqui-ai:dev' into dev 2023-11-30 14:19:05 -03:00
Eren Gölge 6d1905c2b7
Update to v0.21.2 2023-11-30 13:05:10 +01:00
Hannes Krumbiegel e40527b103
Fix link to installation instructions (#3329) 2023-11-30 13:03:33 +01:00
Enno Hermann 39321d02be
fix: correctly strip/restore initial punctuation (#3336)
* refactor(punctuation): remove orphan code for handling lone punctuation

The case of lone punctuation is already handled at the top of restore(). The
removed if statement would never be called and would in fact raise an
AttributeError because the _punc_index named tuple doesn't have the attribute
`mark`.

* refactor(punctuation): remove unused argument

* fix(punctuation): correctly handle initial punctuation

Stripping and restoring initial punctuation didn't work correctly because the
string-splitting caused an additional empty string to be inserted in the text
list (because `".A".split(".")` => `["", "A"]`). Now, an initial empty string is
skipped and relevant test cases are added.

Fixes #3333
2023-11-30 13:03:16 +01:00
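The root cause can be reproduced in a couple of lines; the skip of the empty first segment below is only a sketch of the described handling:
```
text = ".A"
segments = text.split(".")
print(segments)  # ['', 'A'] -- the leading empty string shifted punctuation restoration

# Sketch: drop a spurious empty first segment before re-attaching the stripped marks
if segments and segments[0] == "":
    segments = segments[1:]
print(segments)  # ['A']
```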
Eren Gölge 93283385e0
Merge pull request #3318 from coqui-ai/calling_hf_models
Run XTTS models by direct name with versions
2023-11-30 13:02:26 +01:00
Frederico S. Oliveira 77c2155609
Merge pull request #1 from coqui-ai/dev
Update
2023-11-29 17:24:02 -03:00
Eren G??lge bfbaffc84a Fixup 2023-11-28 13:47:45 +01:00
Eren G??lge 18b7d746cb Updating XTTS docs 2023-11-27 14:54:49 +01:00
Eren G??lge b75e90ba85 Make text splitting optional 2023-11-27 14:53:11 +01:00
Eren G??lge 3b8894a3dd Make style 2023-11-27 14:15:50 +01:00
Eren G??lge 2fd8cf3d94 Make xtts runnable by version names 2023-11-27 14:15:16 +01:00
Eren G??lge 11ec9f7471 Add hi in config defaults 2023-11-24 15:38:36 +01:00
Eren G??lge 00a870c26a Update to v0.21.1 2023-11-24 15:15:44 +01:00
Eren G??lge 7e575068c9 Merge branch 'dev' of https://github.com/coqui-ai/TTS into dev 2023-11-24 15:15:19 +01:00
Eren G??lge 32065139e7 Simple text cleaner for "hi" 2023-11-24 15:14:34 +01:00
Eren Gölge 1542a50c3a
Update to v0.21.0 2023-11-24 14:37:05 +01:00
Eren G??lge 6dd43b0ce2 Update to XTTS v2.0.3 2023-11-24 14:36:04 +01:00
Julian Weber a55755c8df
update deepspeed version (#3281) 2023-11-24 12:35:49 +01:00
Kaszanas 1bf5926196
Introducing Development Dockerfile (#3263)
* Moved Dockerfile, COPY at the end

This change should prevent re-installation of the dependencies upon
every change of the repository's contents. Typically if Docker detects
that something changed in a layer, all downstream layers are invalidated
and rebuilt.

* Moved Dockerfile back to main directory

Main dockerfile in a separate directory can cause issues with the
current CI/CD setup. This can be a good change for later.

* Introduced Dockerfile.dev, updated CONTRIBUTING

Dockerfile.dev can be used as a separate development environment for
anyone that does not wish to install the dependencies locally.
2023-11-24 12:30:15 +01:00
TITC 4d0f53d2ee
Misjudgment of `is_multi_lingual` When Loading Multilingual Model via `model_path` (#3273)
* load multilingual model by path

* use the config to determine whether the model is multilingual
2023-11-24 12:28:31 +01:00
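A sketch of the idea behind the fix: derive multilinguality from the loaded config instead of the model name. `load_config` is the package's config loader; the path and the `languages` attribute check are illustrative assumptions:
```
from TTS.config import load_config

config = load_config("/path/to/model_dir/config.json")
# Illustrative check: multilingual models typically list their languages in the config
is_multi_lingual = bool(getattr(config, "languages", None))
print(is_multi_lingual)
```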
Enno Hermann 8c5227ed84
Fix tts_with_vc (#3275)
* Revert "fix for issue 3067"

This reverts commit 041b4b6723.

Fixes #3143. The original issue (#3067) was people trying to use
tts.tts_with_vc_to_file() with XTTS and was "fixed" in #3109. But XTTS has
integrated VC and you can just do tts.tts_to_file(..., speaker_wav="..."); there
is no point in passing it through FreeVC afterwards. So, reverting this commit
because it breaks tts.tts_with_vc_to_file() for any model that doesn't have
integrated VC, i.e. all models this method is meant for.

* fix: support multi-speaker models in tts_with_vc/tts_with_vc_to_file

* fix: only compute spk embeddings for models that support it

Fixes #1440. Passing a `speaker_wav` argument to regular Vits models failed
because they don't support voice cloning. Now that argument is simply ignored.
2023-11-24 12:26:37 +01:00
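The distinction drawn in this commit, as a usage sketch of the high-level API (model names and file paths are examples):
```
from TTS.api import TTS

# XTTS has integrated voice cloning: pass speaker_wav directly, no FreeVC pass needed
xtts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
xtts.tts_to_file(text="Hello there.", speaker_wav="target_voice.wav",
                 language="en", file_path="xtts_out.wav")

# Models without integrated cloning are routed through FreeVC via tts_with_vc_to_file()
vits = TTS("tts_models/en/ljspeech/vits")
vits.tts_with_vc_to_file(text="Hello there.", speaker_wav="target_voice.wav",
                         file_path="vits_vc_out.wav")
```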
Enno Hermann 2af0220996
fix: don't pass quotes to espeak (#3286)
Previously, the text was wrapped in an additional set of quotes that was passed
to Espeak. This could result in different phonemization in certain edge cases and
caused the insertion of an initial separator "_" that had to be removed.
Compare:
```
$ espeak-ng -q -b 1 -v en-us --ipa=1 '"A"'
_ˈɐ
$ espeak-ng -q -b 1 -v en-us --ipa=1 'A'
ˈeɪ
```

Fixes #2619
2023-11-24 12:25:37 +01:00
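For illustration only (not the repository's phonemizer code): calling espeak-ng with an argv list sidesteps shell quoting, so the text never gains the extra quotes that changed the phonemization:
```
import subprocess

def espeak_ipa(text: str, voice: str = "en-us") -> str:
    # argv is a list, so no shell is involved and no quotes wrap the text
    cmd = ["espeak-ng", "-q", "-b", "1", "-v", voice, "--ipa=1", text]
    return subprocess.run(cmd, capture_output=True, text=True, check=True).stdout.strip()

print(espeak_ipa("A"))  # ˈeɪ, with no leading "_" separator to strip
```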
Enno Hermann 4a2684be34
fix(bin.synthesize): more informative error for wrong --language argument (#3294)
In multilingual models, the target language is specified via the
`--language_idx` argument. However, the `tts` CLI also accepts a `--language`
argument for use with Coqui Studio, so it is easy to choose the wrong one,
resulting in the following confusing error at synthesis time:

```
AssertionError:   Language None is not supported. Supported languages are
['en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar',
'zh-cn', 'hu', 'ko', 'ja']
```

This commit adds a better error message when `--language` is passed for a
non-studio model.

Fixes #3270, fixes #3291
2023-11-24 12:24:42 +01:00
Tessa Painter 64f391b583
Made the tqdm `progress_bar` objects of static download methods a static class variable (#3297) 2023-11-24 12:23:59 +01:00
Eren Gölge b47d9c6e36
Merge pull request #3243 from idiap/checkpoints
Remove duplicate/unused code
2023-11-22 23:52:06 +01:00
Eren Gölge 29dede20d3
Merge pull request #3249 from coqui-ai/run_ci_for_v0.20.6
Run CI for v0.20.6
2023-11-17 15:45:26 +01:00
Eren Gölge c011ab7455 Update to v0.20.6 2023-11-17 15:16:32 +01:00
Eren G??lge 52cb1e2f68 Update model hash for v2.0.2 2023-11-17 15:16:32 +01:00
Edresson Casanova 6075fa208c Ensures that only GPT model is in training mode during XTTS GPT training (#3241)
* Ensures that only GPT model is in training mode during training

* Fix parallel wavegan unit test
2023-11-17 15:15:22 +01:00
Eren G??lge a3279f9294 Make style 2023-11-17 15:15:22 +01:00
Eren G??lge f21067a84a Make k_diffusion optional 2023-11-17 15:15:21 +01:00
Eren G??lge 44494daa27 Update CI version 2023-11-17 15:15:21 +01:00
Eren G??lge c864acf2b7 Update versions 2023-11-17 15:15:21 +01:00
Edresson Casanova 11283fce07
Ensures that only GPT model is in training mode during XTTS GPT training (#3241)
* Ensures that only GPT model is in training mode during training

* Fix parallel wavegan unit test
2023-11-17 15:13:46 +01:00
Eren Gölge 14579a4607
Merge pull request #3248 from coqui-ai/slacker_deps
Update versions
2023-11-17 15:13:19 +01:00
Eren G??lge 44880f09ed Make style 2023-11-17 13:43:34 +01:00
Eren G??lge 26efdf6ee7 Make k_diffusion optional 2023-11-17 13:42:33 +01:00
Eren G??lge 08d11e9198 Update CI version 2023-11-17 13:01:32 +01:00
Eren G??lge 63d7145647 Update versions 2023-11-17 12:10:46 +01:00
Enno Hermann 0fb0d67de7 refactor: use save_checkpoint()/save_best_model() from Trainer 2023-11-17 01:18:23 +01:00
Enno Hermann 96678c7ba2 refactor: use copy_model_files() from Trainer 2023-11-17 01:18:23 +01:00
Enno Hermann 5119e651a1 chore(utils.io): remove unused code
These are all available in Trainer.
2023-11-17 01:18:23 +01:00
Enno Hermann 39fe38bda4 refactor: use save_fsspec() from Trainer 2023-11-17 01:18:23 +01:00
Enno Hermann fdf0c8b10a chore(encoder): remove unused code 2023-11-17 01:18:23 +01:00
Eren Gölge 7e4375da2b
Update to v0.20.6 2023-11-16 17:52:13 +01:00
Julian Weber fbc18b8c34
Fix zh bug (#3238) 2023-11-16 17:51:37 +01:00
Julian Weber 675f983550
Add sentence splitting (#3227)
* Add sentence splitting

* update requirements

* update default args v2

* Add spanish

* Fix return gpt_latents

* Update requirements

* Fix requirements
2023-11-16 11:01:11 +01:00
Enno Hermann 3c2d5a9e03
Remove duplicate AudioProcessor code and fix ExtractTTSpectrogram.ipynb (#3230)
* chore: remove unused argument

* refactor(audio.processor): remove duplicate stft+griffin_lim

* chore(audio.processor): remove unused compute_stft_paddings

Same function available in numpy_transforms

* refactor(audio.processor): remove duplicate db_to_amp

* refactor(audio.processor): remove duplicate amp_to_db

* refactor(audio.processor): remove duplicate linear_to_mel

* refactor(audio.processor): remove duplicate mel_to_linear

* refactor(audio.processor): remove duplicate build_mel_basis

* refactor(audio.processor): remove duplicate stft_parameters

* refactor(audio.processor): use pre-/deemphasis from numpy_transforms

* refactor(audio.processor): use rms_volume_norm from numpy_transforms

* chore(audio.processor): remove duplicate assert

Already checked in numpy_transforms.compute_f0

* refactor(audio.processor): use find_endpoint from numpy_transforms

* refactor(audio.processor): use trim_silence from numpy_transforms

* refactor(audio.processor): use volume_norm from numpy_transforms

* refactor(audio.processor): use load_wav from numpy_transforms

* fix(bin.extract_tts_spectrograms): set quantization bits

* fix(ExtractTTSpectrogram.ipynb): adapt to current TTS code

Fixes #2447, #2574

* refactor(audio.processor): remove duplicate quantization methods
2023-11-16 10:57:06 +01:00
Eren Gölge 88630c60e5
Update to v0.20.5 2023-11-15 14:02:51 +01:00
Edresson Casanova 73a5bd08c0
Fix XTTS GPT padding and inference issues (#3216)
* Fix end artifact for fine tuning models

* Bug fix on zh-cn inference

* Remove unused code
2023-11-15 14:02:05 +01:00
Ikko Eltociear Ashimine 15f0ac57d6
Update README.md (#3215)
Dicord -> Discord
2023-11-15 13:59:56 +01:00
Julian Weber 04901fb2e4
Add speed control for inference (#3214)
* Add speed control for inference

* Fix XTTS tests

* Add speed control tests
2023-11-14 16:07:17 +01:00
Eren Gölge d96f3885d5
Update to v0.20.4 2023-11-13 17:07:25 +01:00
Eren Gölge ac3df409a6
Merge pull request #3208 from coqui-ai/fix_max_mel_len
fix max generation length for XTTS
2023-11-13 14:32:56 +01:00
Eren Gölge f32a465711
Merge pull request #3207 from coqui-ai/update_xtts_cloning
Update XTTS cloning
2023-11-13 14:32:43 +01:00
Eren G??lge 92fa988aec Fixup 2023-11-13 13:44:06 +01:00
WeberJulian b85536b23f fix max generation length 2023-11-13 13:18:45 +01:00
Eren G??lge b2682d39c5 Make style 2023-11-13 13:01:01 +01:00
Eren G??lge a16360af85 Implement chunking gpt_cond 2023-11-13 13:00:08 +01:00
Eren Gölge 6f1cba2f81
Update to v0.20.3 2023-11-09 17:41:37 +01:00
Enno Hermann 3b1e7038bc
fix(formatters): set missing root_path attribute (#3182)
Fixes #2778
2023-11-09 16:49:52 +01:00
Aarni Koskela a8e9163fb3
xtts/tokenizer: merge duplicate implementations of preprocess_text (#3170)
This was found via ruff:

> F811 Redefinition of unused `preprocess_text` from line 570
2023-11-09 16:32:12 +01:00
Matthew Boakes 1b9c400bca
PyTorch 2.1 Updates (Weight Norm and TorchAudio I/O) (#3176)
* Replaced PyTorch weight_norm With parametrizations.weight_norm

* TorchAudio: Migrating The I/O Functions To Use The Dispatcher Mechanism

* Corrected Code Style

---------

Co-authored-by: Eren Gölge <erogol@hotmail.com>
2023-11-09 16:31:03 +01:00
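A small sketch of the two migrations named in this PR (layer shapes and file names are arbitrary examples):
```
import torch
from torch import nn
from torch.nn.utils import parametrizations
import torchaudio

# weight_norm: the old torch.nn.utils.weight_norm() hook is deprecated in favor of the
# parametrization-based variant
conv = nn.Conv1d(80, 256, kernel_size=3, padding=1)
conv = parametrizations.weight_norm(conv)

# TorchAudio >= 2.1 routes load/save through a dispatcher that selects a backend
waveform, sample_rate = torchaudio.load("sample.wav")
torchaudio.save("copy.wav", waveform, sample_rate)
```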
Gorkem 66a1e248d0
torchaudio should use proper backend to load audio (#3179) 2023-11-09 16:28:39 +01:00
Eren Gölge 46d9c27212
Update to v0.20.2 2023-11-08 16:07:56 +01:00
Julian Weber 58cb0d8dd0
Remove v1 doc and tests (#3172)
* remove v1 in inference.md

* remove v1 in README.md

* Update test_models.py
2023-11-08 14:51:42 +01:00
Julian Weber 03ad90135b
Add lang code in XTTS doc (#3158)
* Add lang code in XTTS doc

* Remove unused config and args

* update docs

* woops
2023-11-08 13:47:33 +01:00
Gorkem 78a596618a
Fix for exception on streaming if last chunk empty (#3160) 2023-11-08 11:32:02 +01:00
Enno Hermann 99edd6daa3
Fix ModelManager.list_models() (#3128)
* fix(utils.manage): remove hard-coded model_type variable

* refactor(utils.manage): address lint issues, fix typos

Addressed the following:
TTS/utils/manage.py:307:12: R1705: Unnecessary "else" after "return" (no-else-return)
TTS/utils/manage.py:308:21: W1514: Using open without explicitly specifying an encoding (unspecified-encoding)
TTS/utils/manage.py:299:4: R1710: Either all return statements in a function should return an expression, or none of them should. (inconsistent-return-statements)
TTS/utils/manage.py:299:4: R0201: Method could be a function (no-self-use)
TTS/utils/manage.py:314:4: R0201: Method could be a function (no-self-use)
2023-11-08 11:29:01 +01:00
Eren Gölge 77b18126c7
Merge pull request #3126 from akx/freevc-config-module
Move FreeVCConfig to TTS.vc.configs (like all other config classes)
2023-11-08 11:24:47 +01:00
Eren Gölge cc6e9fcaa7
Fix #3153 (#3169) 2023-11-08 11:13:58 +01:00
Eren Gölge a24ebcd8a6
Fix coqui api (#3168) 2023-11-08 10:51:23 +01:00
Julian Weber ce1a39a9a4
Add char limit warn (#3130)
* Add char limit warning

* Adding v2 langs

* cached_property for cutlet

* Fix import
2023-11-08 10:24:23 +01:00
Eren Gölge f846a9f300
Update to v0.20.1 2023-11-07 14:17:36 +01:00
Edresson Casanova cbdbc44e0f
Fix XTTS v2.0 training recipe (#3154)
* Fix XTTS v2.0 training recipe

* Update XTTS v2 model hash
2023-11-07 14:16:44 +01:00
Eren Gölge 5e992d8704
Merge pull request #3149 from coqui-ai/fixup_xtts_v2
Bug fixes and add support for multiple speaker references on XTTS inference
2023-11-07 10:36:20 +01:00
Edresson Casanova 5f9ab6cfaa
Fix style
Co-authored-by: Aarni Koskela <akx@iki.fi>
2023-11-06 19:22:34 -03:00
Edresson Casanova 905900afc9 Update XTTS v1.1 recipe 2023-11-06 19:14:50 -03:00
Edresson Casanova 2470599d18 Drop XTTS v1 2023-11-06 19:12:04 -03:00
Edresson Casanova 13243df526 Update XTTS v1.1 files 2023-11-06 19:10:21 -03:00
Edresson Casanova cabff9f323 Update XTTS v2.0 recipe 2023-11-06 17:47:14 -03:00
Edresson Casanova 09fb317e6d Remove unused code 2023-11-06 17:36:32 -03:00
Edresson Casanova b146de4ce8 Bug fix on XTTS v2.0 Trainer 2023-11-06 20:26:01 +01:00
Edresson Casanova f444f296f2 Add multiple references on xtts inference tests 2023-11-06 20:25:06 +01:00
Edresson Casanova 1b6f8d0e46 Update unit tests and recipes 2023-11-06 20:25:06 +01:00
Edresson Casanova 72b2bac0f8 Load reference at 24 kHz to avoid issues with multiple sample-rate references 2023-11-06 20:25:06 +01:00
Edresson Casanova 00294ffdf6 Update XTTS docs 2023-11-06 20:24:06 +01:00
Edresson Casanova 459ad70dc8 Add support for multiple speaker references on XTTS inference 2023-11-06 20:22:35 +01:00
Edresson Casanova 9942000c50 Update XTTS v2 recipe model files 2023-11-06 20:20:28 +01:00
Eren Gölge f0cb19ecca
Drop diffusion from XTTS (#3150)
* Drop diffusion for XTTS

* Make style

* Drop diffusion deps in code

* Restore thrashed
2023-11-06 20:15:49 +01:00
Eren G??lge 5d418bb84a Update docs 2023-11-06 18:48:41 +01:00
Eren G??lge 9bbf6eb8dd Drop use_ne_hifigan 2023-11-06 18:43:38 +01:00
Eren G??lge 9d54bd7655 Fixup XTTS 2023-11-06 18:13:58 +01:00
Eren Gölge c713a839da
Update VERSION 2023-11-06 15:51:56 +01:00
Eren Gölge 7eedfc67da
Update README.md 2023-11-06 15:37:32 +01:00
Edresson Casanova e45227d9ff
XTTS v2.0 (#3137)
* Implement most similar ref training approach

* Use non-enhanced hifigan for test samples

* Add Perceiver

* Update GPT Trainer for perceiver support

* Update XTTS docs

* Bug fix masking with XTTS perceiver

* Bug fix on gpt forward

* Bug Fix on XTTS v2.0 training

* Add XTTS v2.0 unit tests

* Add XTTS v2.0 inference unit tests

* Bug Fix on diffusion inference

* Add XTTS v2.0 training recipe

* Placeholder model entry

* Add cloning params to config

* Make prompt embedding configurable

* Make cloning configurable

* Cheap fix for a cheaper fix

* Prevent resampling

* Update model entry

* Update docs

* Update requirements

* Code linting

* Add xtts v2 to sep tests

* Bug fix on XTTS get_gpt_cond_latents

* Bug fix on rebase

* Make style

* Bug fix in Japanese tokenizer

* Add num2words to deps

* Remove unused kwarg and added num_beams=1 as default

---------

Co-authored-by: Eren G??lge <egolge@coqui.ai>
2023-11-06 14:58:18 +01:00
Aarni Koskela 38f6f8f0bb
Run `make style` & re-enable it in CI (#3127) 2023-11-06 11:36:37 +01:00
Aarni Koskela 5ae369d629 Move FreeVCConfig to TTS.vc.configs (like all other config classes) 2023-10-31 16:56:25 +02:00
Eren Gölge 6fef4f9067
Bump up to v0.19.1 2023-10-30 10:37:28 +01:00
Eren Gölge eccc94be9b
Merge pull request #2983 from vltmedia/dev
Bug: self.model_name needed to be initialized.
2023-10-28 10:39:25 +02:00
Eren Gölge 2d6bd716ef
Merge pull request #3109 from coqui-ai/tts_3067
fix for issue 3067
2023-10-28 10:37:52 +02:00
Eren Gölge 788959d720
Merge pull request #3103 from coqui-ai/fix_xttsv1.1_again
Second round of issue fixing for XTTS v1.1
2023-10-28 10:33:19 +02:00
WeberJulian 1c98821359 Remove unused load_audio function 2023-10-27 22:27:18 +02:00
Aya Jafari 041b4b6723 fix for issue 3067 2023-10-26 13:06:01 -03:00
WeberJulian d4e08c8d6c Add features to get_conditioning_latents 2023-10-26 14:57:33 +02:00
WeberJulian c1133724a1 Move lang token add to tokenizer 2023-10-26 14:52:13 +02:00
WeberJulian 6fa46d197d Fix get_conditioning_latents when using only ne 2023-10-26 14:51:35 +02:00
Eren Gölge edd3a28723
Bump up to v0.19.0 2023-10-25 13:29:38 +02:00
Eren Gölge 16ba377f61
Merge pull request #3086 from coqui-ai/xtts_trainer
XTTS v1.1 GPT Trainer
2023-10-25 13:28:47 +02:00
Edresson Casanova 01839af926 Bug fix on XTTS masking training 2023-10-24 18:30:14 -03:00
Edresson Casanova 8af3d2dbcd Add a dedicated workflow for XTTS tests 2023-10-24 09:52:44 -03:00
VLT Media 818aa0eb7e
Merge branch 'coqui-ai:dev' into dev 2023-10-23 23:36:33 -04:00
Edresson Casanova de1d521c8a Update XTTS docs 2023-10-23 13:35:15 -03:00
Edresson Casanova 0f96abb5ec Add FT inference example on XTTS docs 2023-10-23 13:23:30 -03:00
Edresson Casanova 67ca70aff4 Fix Delightful TTS layers unit test 2023-10-23 11:47:10 -03:00
Edresson Casanova 37b7945474 Update XTTS train not implemented error to point to the XTTS docs 2023-10-23 11:39:17 -03:00
Edresson Casanova 1ee8096799 Update XTTS docs 2023-10-23 11:13:09 -03:00
Edresson Casanova 6fefc36e5a Update XTTS docs 2023-10-23 11:03:57 -03:00
Edresson Casanova 8853e1c3ec Update XTTS recipe to only download checkpoint if it is needed 2023-10-23 10:45:41 -03:00
Edresson Casanova 653f2e75ef Update xtts trainer recipe 2023-10-23 09:58:16 -03:00
Edresson Casanova e8a1a50273 Remove unused vars in Delightful TTS layers tests 2023-10-23 09:26:36 -03:00
Edresson Casanova ec7f54768a Rebase bug fix and update recipe 2023-10-21 17:37:51 -03:00
Edresson Casanova affaf11148 Add XTTS training unit test 2023-10-21 13:41:12 -03:00
Edresson Casanova 1f92741d6a Fix issue #2971 2023-10-21 13:37:21 -03:00
Edresson Casanova 94dcf84979 Rename XTTS recipe 2023-10-21 13:37:21 -03:00
Edresson Casanova 5f98dbeec9 Update Ljspeech XTTS recipe 2023-10-21 13:37:21 -03:00
Edresson Casanova 469d624615 Update LJspeech XTTS recipe 2023-10-21 13:37:21 -03:00
Edresson Casanova 9e3598c3b7 Bug Fix on inference using XTTS trainer checkpoint 2023-10-21 13:37:21 -03:00
Edresson Casanova c4ceaabe2c Add test sentences during the training 2023-10-21 13:33:56 -03:00
Edresson Casanova 2f868dd5c2 Bug fix on reproducible evaluation 2023-10-21 13:33:56 -03:00
Edresson Casanova bafab049c2 Add prompting masking 2023-10-21 13:33:56 -03:00
Edresson Casanova 47d613df3a Add reproducible evaluation 2023-10-21 13:33:56 -03:00
Edresson Casanova 40a4e631ea Update mel spectrogram for the style encoder 2023-10-21 13:33:56 -03:00
Edresson Casanova a32961bcb4 Add XTTS base training code 2023-10-21 13:33:56 -03:00
Eren Gölge 1e152692ed
Bump up to v0.18.2 2023-10-21 17:29:53 +02:00
Eren Gölge 420a90ed63
Merge pull request #3096 from coqui-ai/fix-xtts-v1.1
Fix xtts v1.1
2023-10-21 17:28:58 +02:00
Julian Weber dad6a7b0b6
Preserve [ja] token of the text processing 2023-10-21 11:26:03 +02:00
Julian Weber c7a16042e3
Remove global cutlet import 2023-10-21 11:18:58 +02:00
Edresson Casanova 414f0de0a1
Bump up to v0.18.1 2023-10-20 17:30:58 -03:00
Edresson Casanova 59576fc0ec
Bug fix on XTTS v1.1 inference (#3093)
* Bug fix on XTTS v1.1 inference

* Update .models.json

---------

Co-authored-by: Julian Weber <julian.weber@hotmail.fr>
2023-10-20 17:29:43 -03:00
Eren Gölge 85e7323739
Bump up to v0.18.0 2023-10-20 16:03:24 +02:00
Julian Weber cf97116185
XTTS v1.1 (#3089)
* Add support for ne_hifigan

* Update model.json

* Update hash

* Fix model loading

* Enhance text_normalization

* Add xtts to zoo test exception

* Add model hash check

* Add get_number_tokens
2023-10-20 16:02:08 +02:00
Eren Gölge 747f688dc3
Bump up to v0.17.10 2023-10-19 12:00:15 +02:00
Eren Gölge 93e6961bb5
Update .models.json 2023-10-19 11:59:49 +02:00
Eren Gölge bf68848f38
Bump up to v0.17.9 2023-10-19 11:22:42 +02:00
Eren Gölge c3b011217d
Update .models.json 2023-10-19 11:21:21 +02:00
Julian Weber d21f15cc85
fix readme (#3071)
* fix readme

* fix inference.md
2023-10-17 10:27:11 +02:00
Julian Weber dcce1644b7
Fix doc dataset (#3070)
* fix formatting dataset doc

* fix autocomplete
2023-10-16 12:29:52 +02:00
David Garvey a151d70242
Add stdout option (#3027)
* add cli options for play and speed
--play argument uses simpleaudio to play the tts wav
--speed <float 0.0-2.0> passes speed argument to Coqui Studio models

* remove simpleaudio not referenced in file

* fix simpleaudio dependency version

* add ALSA headers for simpleaudio compilation

* Dockerfile ALSA headers for simpleaudio

* base changes to use stdout instead of playing audio
Considering piping wav data to another program like aplay for audio playback.

This is incomplete code. Using it to get feedback before proceeding with
implementation.

* remove play in favor of a pipe_out arg that suppresses stdout
Removed the play option and the simpleaudio dependency in favor of pipe
functionality that allows passing wav file data to a program
dedicated to playing audio.

* scipy.io.wavfile.write fails with /dev/null target

* Streaming inference for XTTS 🚀 (#3035)

* v0.17.7

* Redownload XTTS when the local and remote config do not match

* Remove unused method

* Print a message when it is already downloaded

* Try-except to present an error when the user doesn't have a connection

* Fix style

* 0.17.8

* v0.17.8

---------

Co-authored-by: Julian Weber <julian.weber@hotmail.fr>
Co-authored-by: Eren Gölge <erogol@hotmail.com>
Co-authored-by: Edresson Casanova <edresson1@gmail.com>
Co-authored-by: ggoknar <ggoknar@coqui.ai>
2023-10-16 12:07:21 +02:00
Eren Gölge cae185fd16
Update README.md 2023-10-16 12:00:59 +02:00
Subash-Lamichhane b4666bb75e
fixed typo of /docs (#3065) 2023-10-16 11:57:15 +02:00
Subash-Lamichhane 3d146422c2
fixed typo of docs\source\implementing_a_new_model.md (#3066) 2023-10-16 11:57:04 +02:00
Dusty Hagstrom 13cd076a7f
Synthesizer skips over embeddings file if model only has one speaker (#2587)
* It looks like the Neon model is special in that it does not have a speaker_name and it wants to get the only item available. This was blocking a valid model with one speaker and a d_vector_file from being executed to get the embedding.

* Update synthesizer.py

oh my how embarrassing
2023-10-16 11:55:45 +02:00
Meryem Sakin e4b8d71f2b
Update AnalyzeDataset.ipynb (#2783) 2023-10-16 11:52:37 +02:00
Eren Gölge b25d96ecee
Merge pull request #3058 from coqui-ai/spkr_enc_3020
fixed bugs in fastpitch tts synthesis
2023-10-14 11:40:31 +02:00
Aya Jafari ffddf10458 unit test fix 2023-10-13 10:56:47 -03:00
Aya Jafari 6eaecab0ca fixed bugs in fastpitch tts synthesis 2023-10-10 23:02:31 -03:00
ggoknar 99635193f5 v0.17.8 2023-10-07 01:14:05 +03:00
ggoknar 3bb51b1276 0.17.8 2023-10-07 01:13:02 +03:00
Gorkem 0f46757c47
Merge pull request #3038 from coqui-ai/xtts_redonwload
XTTS redownload if needed
2023-10-07 01:02:44 +03:00
Edresson Casanova 2852404bdf Fix style 2023-10-06 17:42:46 -03:00
Edresson Casanova 99650044a4 Try-except to present an error when the user doesn't have a connection 2023-10-06 17:37:05 -03:00
Edresson Casanova 529ea3f67f Print a message when it is already downloaded 2023-10-06 17:26:40 -03:00
Edresson Casanova ee1ef1c51e Remove unused method 2023-10-06 17:21:22 -03:00
Edresson Casanova 4a6103fec9 Redownload XTTS when the local and remote config do not match 2023-10-06 17:16:30 -03:00
Eren Gölge 0520697b5f
v0.17.7 2023-10-06 18:35:26 +02:00
Julian Weber e5e0cbffc9
Streaming inference for XTTS 🚀 (#3035) 2023-10-06 18:34:06 +02:00
OPERATOR 2150136210
None is not able to be read for "XTTS"; fixes a crash if it's set to None. (#3009) 2023-10-02 12:53:36 +02:00
Anupam Maurya f133b9d2d7
Upgrade and Optimize TTS Code in extractttsspectrogram.ipynb (#3012) 2023-10-02 12:51:55 +02:00
Eren Gölge 155c5fc0bd
v0.17.6 2023-09-29 23:44:09 +02:00
Edresson Casanova 4c3c11c958
Tortoise inference fix and fix zoo unit tests (#3010) 2023-09-29 13:40:57 +02:00
Eren Gölge bb05dcb9b4
Merge pull request #2922 from coqui-ai/be_tts
Adding Belarusian TTS model
2023-09-27 09:48:28 +02:00
Eren Gölge 8cba47191f
Merge pull request #2993 from akx/tts-readme
Ensure `tts` CLI tool readme and usage is in sync
2023-09-27 09:46:54 +02:00
Eren Gölge e39da6147f
Merge pull request #2999 from akx/remove-unnecessary-black-config
Remove unnecessary black exclude config
2023-09-27 09:43:06 +02:00
Eren Gölge 536a12b045
Merge pull request #3001 from akx/fix-deps-again
Loosen dependency pins
2023-09-27 09:42:27 +02:00
Eren Gölge ea51a7ffcc
Merge pull request #3003 from akx/duplicate-code-removal
Duplicate code removal
2023-09-27 09:41:35 +02:00
Aarni Koskela 0dbe7cbcc4 Remove duplicate convert_pad_shape 2023-09-27 01:10:48 +03:00
Aarni Koskela 33a7c722f6 Merge duplicate on_train_step_start functions in delightful_tts 2023-09-27 01:10:44 +03:00
Aarni Koskela 861c68b0b8 Rename misnamed setter 2023-09-27 01:09:59 +03:00
Aarni Koskela 09e14e68db Remove duplicate get_named_beta_schedules 2023-09-27 01:09:59 +03:00
Aarni Koskela 59f85a7122 Remove duplicate code from xtts.tokenizer 2023-09-27 01:09:59 +03:00
Aarni Koskela 6277f09c5f requirements.txt: loosen pandas pin (1.4 would need to be compiled from source on macs) 2023-09-26 20:43:59 +03:00
Aarni Koskela 8bb2d652ca pyproject.toml: loosen dependencies to avoid building from source 2023-09-26 20:41:26 +03:00
Aarni Koskela 94c5fd0765 Remove unnecessary black exclude config
It seems to have been copy-pasted from the Black docs.
2023-09-26 16:02:55 +03:00
Aarni Koskela 0a82f063cc Late-import main TTS libraries in `tts` CLI 2023-09-26 15:38:56 +03:00
Aarni Koskela 5c047cf304 Ensure `tts` CLI tool readme and usage help is in sync 2023-09-26 15:38:56 +03:00
Eren Gölge 0b95b88f13
Bump up to v0.17.5 2023-09-25 18:16:45 +02:00
Eren Gölge 359755cca0
Merge pull request #2990 from coqui-ai/fix-TTS-install
fix package versions
2023-09-25 18:10:29 +02:00
WeberJulian 089ad66df2 Lower the versions constraints 2023-09-25 17:00:41 +02:00
WeberJulian bbfdfbffdf Update transformers to latest 2023-09-25 11:46:38 +02:00
WeberJulian f1c1d14c54 Add back umap 2023-09-25 11:12:01 +02:00
WeberJulian a2a15392e0 fix package versions 2023-09-25 11:01:36 +02:00
VLT Media dd73910651
Bug: self.model_name needed to be initialized.
Bug: self.model_name needed to be initialized to get around a bug that automatically crashes when the user provides the model paths but no model_name when initializing the TTS object.
2023-09-23 01:41:35 -04:00
loupzeur da8b6bbce1
fix: xtts not taking into account device flag (#2951)
* fix: xtts not taking into account device flag

* Style changes

---------

Co-authored-by: Julian Weber <julian.weber@hotmail.fr>
2023-09-20 09:57:02 +02:00
Omar Sanseviero 335ae63e01
Add coqui blog post (#2949)
* Update README.md

* Update README.md

---------

Co-authored-by: Edresson Casanova <edresson1@gmail.com>
2023-09-19 19:57:09 -03:00
Julian Weber 6916aa37ab
Fix fsspec requirement (#2970)
* Fix requirement for fsspec

* Use the right version this time
2023-09-19 15:54:12 -03:00
Reuben Morais f829bf50f8
Bump version to v0.17.4 (really) 2023-09-15 16:40:34 +02:00
Eren G??lge aa8fa4756e Bump up to v0.17.4 2023-09-14 17:52:44 +02:00
Eren G??lge 9d0b76ce23 Check env var for COQUI_TOS_AGREED 2023-09-14 17:51:40 +02:00
Eren G??lge 13dd7c4c9e Bump up to v0.17.2 2023-09-14 15:24:05 +02:00
Eren G??lge ded7fd4fb2 Make style 2023-09-14 15:23:37 +02:00
Eren G??lge 44b61d2b92 Fixup 2023-09-14 15:22:54 +02:00
Eren Gölge 623ea41634
Fix model tests (#2943) 2023-09-14 15:21:48 +02:00
Eren G??lge af62613c86 Bump up to v0.17.1 2023-09-13 18:23:39 +02:00
Eren G??lge ee7cee0e35 Fixup 2023-09-13 18:21:44 +02:00
Eren G??lge 5dcf9ae311 Bump up v0.17.0 2023-09-13 18:04:26 +02:00
Eren Gölge 4033db5f4b 🔥 XTTS implementation 2023-09-13 17:51:24 +02:00
Edresson Casanova 4d3f23b5d3
Add CML-TTS dataset YourTTS training recipe (#2934) 2023-09-12 11:49:14 +02:00
Eren Gölge 9533f8656c Make style 2023-09-04 13:58:37 +02:00
Eren Gölge 562a9509f2 Add BE model 2023-09-04 13:57:03 +02:00
Eren Gölge b4c82685a7 Add model entries 2023-09-04 13:04:58 +02:00
T145 cdc971ff74
Fixed spectrogram checking on librosa 0.10.x (#2899) 2023-09-04 12:58:27 +02:00
Cohee b3b1555d82
Fix exception handling in manage.py (#2912) 2023-09-04 12:54:30 +02:00
Eren G??lge 40b527345f Bump up to v0.16.6 2023-09-04 12:51:53 +02:00
Eren Gölge d1d95707bd
Update docs (#2919) 2023-09-04 12:28:36 +02:00
Unik 32b8ebb633
Updated scipy version (#2914) 2023-09-04 11:39:19 +02:00
Aleś Bułojčyk fead04f779
Add phonemizer for Belarusian language (#2856) 2023-08-28 11:20:45 +02:00
Jake Tae b79b6f0762
feature: add device flag to tts cli (#2875) 2023-08-28 11:20:12 +02:00
Jake Tae fa0cbd78fe
Update README with new device API (#2876)
* docs: update readme w/ .to(device) api

* docs: add .to(device) in python quickstart

* docs: move section header out of comment

* chore: use device instead of hard-coded string

* docs: update inference.md
2023-08-28 11:19:00 +02:00
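The new device API in a nutshell (model name is an example):
```
import torch
from TTS.api import TTS

device = "cuda" if torch.cuda.is_available() else "cpu"

# Models are placed on a device with a standard .to(device) call instead of a gpu flag
tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(device)
tts.tts_to_file(text="Device placement is now explicit.", file_path="out.wav")
```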
Eren Gölge c0b5e61749
Bump up to v0.16.5 2023-08-26 12:00:25 +02:00
Eren Gölge a7a96d08dd
Fix loading Bark (#2893)
* Fixup hubert path

* Make style
2023-08-26 11:59:00 +02:00
Eren Gölge 04a36a727b
Bump up to v0.16.4 2023-08-26 10:39:48 +02:00
Eren Gölge a96562a750
Update .models.json 2023-08-26 10:36:40 +02:00
Jake Tae 409db505d2
Add device support in TTS and Synthesizer (#2855)
* fix: resolve merge conflicts

* fix: retain backwards compatibility in functions

* feature: utilize device for voice transfer

* feature: use device for vocoder

* chore: cleanup vocoder cpu logic

* fix: add necessary vocoder output device check

* fix: add necessary vocoder output device check

* fix: indentation

* fix: check if waveform is pt tensor before cpu conversion

---------

Co-authored-by: Jake Tae <jaketae@Jakes-MacBook-Pro-2.local>
2023-08-14 21:04:44 +02:00
Julian Weber febcaf710a
Add customizable data home path (#2871)
* Add customizable data home path

* Add TTS_HOME as an option
2023-08-14 21:02:48 +02:00
Eren Gölge c4e5effab9 Bump up to v0.16.3 2023-08-13 12:22:04 +02:00
Michael New 1f9d600b83
Denote human voices in README.md (#2851) 2023-08-13 12:15:17 +02:00
Eren Gölge 3a104d5c49
Update Studio API for XTTS (#2861)
* Update Studio API for XTTS

* Update the docs

* Update README.md

* Update README.md

Update README
2023-08-13 12:04:12 +02:00
Eren G??lge 37b558ccb9 Make style 2023-08-11 12:55:23 +02:00
Eren G??lge 9a8352b8da Fix import error with Bark 2023-08-11 03:33:59 +02:00
Eren Gölge c87377b713
Bump up to v0.16.2 2023-08-07 13:21:14 +02:00
Eren Gölge 4186f42b21
Handle missing JA phonemizer (#2843)
* Handle missing JA phonemizer

* Make style
2023-08-07 13:19:38 +02:00
Eren Gölge 48f8133eae
Fix imports (#2845) 2023-08-07 13:19:26 +02:00
Javier 4e7f8cd021
Add fairseq onnx support and strict configuration, fixes some onnx errors (#2831) 2023-08-04 11:02:59 +02:00
ChaseC 52a528cfcf
add post functionality to /api/tts (#2836) 2023-08-04 10:54:20 +02:00
Eren Gölge dc04baa1ee
Bump up to v0.16.1 2023-07-31 15:54:45 +02:00
Eren Gölge 17ddd65741 Please p3.11 2023-07-31 15:53:19 +02:00
Eren Gölge 69f080eb47
Fix DelightfulTTS (#2823)
* Fix tests

* Make style
2023-07-31 13:52:45 +02:00
Eren Gölge 483888b9d8
Add kwargs to ignore extra arguments w/o error (#2822) 2023-07-31 11:37:35 +02:00
AWAS666 9e74b51aa6
Delightful TTS VCTK recipe fixes (#2808)
* fix: wrong import class

* fix: formatter name missing

* feat: get rid of clearml
2023-07-31 10:27:42 +02:00
Aleś Bułojčyk d124f78430
Recipe for Belarusian TTS (#2756)
* Changes from jhlfrfufyfn <jhlfrfufyfn@gmail.com>

* Recipe for Belarusian TTS

---------

Co-authored-by: jhlfrfufyfn <jhlfrfufyfn@gmail.com>
2023-07-31 10:26:21 +02:00
Javier c140df5a58
Adds multi-language support for VITS onnx, fixes onnx inference error when speaker_id is None or not passed, fixes onnx exporting for models with init_discriminator=false (#2816) 2023-07-31 10:19:49 +02:00
Eren Gölge b739326503
Bump up to v0.16.0 2023-07-24 16:04:10 +02:00
Eren Gölge 8aacb81849
Fix Tortoise load (#2791)
* Remove key pruning in tortoise

* Make lint
2023-07-24 13:42:47 +02:00
Eren Gölge b3472a739e
Update README.md 2023-07-24 13:42:20 +02:00
logan hart 6fdb88f8e2
Add Delightful-TTS implementation (#2095)
* add configs

* Update config file

* Add model configs

* Add model layers

* Add layer files

* Add layer modules

* change config names

* Add emotion manager

* Fix missing ap bug

* Fix missing ap bug

* Add base TTS e2e class

* Fix wrong variable name in load_tts_samples

* Add training script

* Remove range predictor and gaussian upsampling

* Add helper function

* Add vctk recipe

* Add conformer docs

* Fix linting in conformer.py

* Add Docs

* remove duplicate import

* refactor args

* Fix bugs

* Remove emotion embedding

* remove unused arg

* Remove emotion embedding arg

* Remove emotion embedding arg

* fix style issues

* Fix bugs

* Fix bugs

* Add unittests

* make style

* fix formatter bug

* fix test

* Add pyworld compute pitch func

* Update requirements.txt

* Fix dataset Bug

* Change layer norm to instance norm

* Add missing import

* Remove emotions.py

* remove ssim loss

* Add init layers func to aligner

* refactor model layers

* remove audio_config arg

* Rename loss func

* Rename to delightful-tts

* Rename loss func

* Remove unused modules

* refactor imports

* replace audio config with audio processor

* Add change sample rate option

* remove broken resample func

* update recipe

* fix style, add config docs

* fix tests and multispeaker embd dim

* remove pyworld

* Make style and fix inference

* Split tts tests

* Fixup

* Fixup

* Fixup

* Add argument names

* Set "random" speaker in the model Tortoise/Bark

* Use a different f0_cache path for delightful tts

* Fix delightful speaker handling

* Fix lint

* Make style

---------

Co-authored-by: loganhart420 <loganartpersonal@gmail.com>
Co-authored-by: Eren Gölge <erogol@hotmail.com>
2023-07-24 13:41:26 +02:00
Eren Gölge f24c5e0276 Update README 2023-07-24 13:30:19 +02:00
Eren Gölge 1652598a33 Test synthesize api separately 2023-07-24 12:38:20 +02:00
Eren Gölge 0de12ec5aa
API tests (#2790)
* Separate API tests and only run when uplifted

* Make style
2023-07-24 12:14:21 +02:00
Paul O'Leary McCann c0aabb8596
Make Japanese-specific dependencies optional (#2776)
* Don't install MeCab by default

* Add optional [ja] deps, like [dev] etc

* Add JA requirements file

* Add JA requirements to requirements_all

This should help the tests run.
2023-07-24 11:28:27 +02:00
Aleś Bułojčyk e5fb0d9627
Fix share model page URL (#2757) 2023-07-09 12:19:49 +02:00
Eren Gölge 672ec3b35e
Fix #2749 (#2750) 2023-07-08 11:40:44 +02:00
Eren Gölge b5cd644132
Bump up to v0.15.6 2023-07-08 10:33:09 +02:00
Eren Gölge a2984fb435
Fix #2745 (#2748) 2023-07-07 20:23:27 +02:00
Eren Gölge 7b5c8422c8
Export multispeaker onnx (#2743) 2023-07-06 13:36:50 +02:00
Eren Gölge 08bc758cad
Merge pull request #2741 from coqui-ai/merge_2651
Resolve conflicts
2023-07-06 09:53:48 +02:00
JiangCheng 53938e2d32 Squashed commit of the following:
commit dd612fd72e
Author: JiangCheng <jiangcheng@kezaihui.com>
Date:   Mon Jun 5 16:04:54 2023 +0800

    Failed to download the file and need to delete the created file path
2023-07-05 12:08:05 +02:00
Eren Gölge e42a72eb79 Fix typo 2023-07-04 12:14:54 +02:00
Eren Gölge 229cfbdf8a Update README.md 2023-07-04 12:09:50 +02:00
Wouter van der Velde d611067d50
fixed small spelling mistakes (#2551) 2023-07-04 11:42:54 +02:00
ZhouGongZaiShi d5f16d77c2
delete meaningless print() (#2662) 2023-07-04 11:38:17 +02:00
PiaoYang 630327c4e6
Update compute_embeddings.py (#2668)
* [Typo] Fix variable name. More readable description.

Update train_yourtts.py

Reformat.

Reformat using black again.

* Add `old_append`. Fix bool argparse.

* Reformat.
2023-07-04 11:37:47 +02:00
ChaseC 8957799e45
fix loading of model and vocoder configs (#2698) 2023-07-04 11:32:00 +02:00
Eren Gölge 505ac1aa8f
Bump up to v0.15.5 2023-07-03 11:18:06 +02:00
Eren Gölge 453d04836b
Merge pull request #2733 from coqui-ai/update_docs
Update docs and credits
2023-07-03 11:17:15 +02:00
Fred f6eaa61afe Adding checkpoint model 2023-07-02 18:55:50 -03:00
Eren Gölge 9b041f958b Update docs and credits 2023-07-02 13:09:40 +02:00
Eren G??lge 21a3f280de Bump up to v0.15.4 2023-06-30 15:05:00 +02:00
Eren G??lge 90cf712bb4 Update docs 2023-06-30 14:58:15 +02:00
Eren Gölge 588fe21310 Update docs 2023-06-30 14:40:54 +02:00
Eren Gölge f9cde7bb1b Bump up to v0.15.3 2023-06-30 14:30:18 +02:00
Eren Gölge 472629d6fd Docs update 2023-06-30 14:30:00 +02:00
Eren G??lge 413a345d66 Bump up to v0.15.2 2023-06-30 14:16:47 +02:00
Eren G??lge 53d3bb07f4 Update docs python ver 2023-06-30 14:16:47 +02:00
Eren G??lge cb9c320691 Fixup 2023-06-30 14:13:11 +02:00
Eren G??lge dfd8d313a2 Bump up to v1.5.1 2023-06-29 17:53:09 +02:00
Eren G??lge 797eab2dd0 Add bark docs 2023-06-29 17:52:42 +02:00
Eren G??lge a035b25340 Bump up to v0.15.0 2023-06-28 15:24:20 +02:00
Eren G??lge b8c2ab9833 [ci skip] Update README 2023-06-28 12:30:54 +02:00
Eren G??lge 34b9a18c47 Fixup 2023-06-28 12:26:04 +02:00
Eren G??lge 91cc11d636 Remove commented codes 2023-06-28 12:14:37 +02:00
Eren G??lge 6b9ebf5aab Merge branch 'p3_11' into dev 2023-06-28 12:13:04 +02:00
Eren Gölge c844b6570a
Inference API for 🐶Bark (#2685)
* Add bark requirements

* Draft Bark implementation

* Download HF models

* Update synthesizer

* Add bark model

* Make style

* Update pylintrc

* Update model URLs

* Update Bark Config

* Fix here and there

* Make style

* Make lint

* Update requirements

* Update requirements
2023-06-28 11:55:27 +02:00
Eren G??lge 4786548287 Prevent running bark test on CI 2023-06-28 11:24:45 +02:00
Eren G??lge f6fa1dbc9f Fix sed 2023-06-27 15:02:49 +02:00
Eren G??lge 3933b47f33 Fixup 2023-06-27 00:08:06 +02:00
Eren G??lge d659dbe3c6 Remove fairseq 2023-06-26 19:31:56 +02:00
Eren G??lge a13b1352a4 Fixup 2023-06-26 19:30:26 +02:00
Eren G??lge 17ac188958 Drop fairseq for Hubert 2023-06-26 19:27:48 +02:00
Eren G??lge c03768bb53 Make style 2023-06-26 17:16:26 +02:00
Eren G??lge a1c431e6a9 Fixups 2023-06-26 12:55:18 +02:00
Eren G??lge 115baf7e47 Drop other p3.8 refs 2023-06-26 11:42:57 +02:00
Eren G??lge 1cce0e8bcb Drop p3.8 from CI 2023-06-26 11:40:58 +02:00
Eren G??lge 0cce2c0e89 Correct python_version 2023-06-22 14:07:35 +02:00
Eren G??lge 8c1d8df759 Disable linter until I've some peace of mind 2023-06-22 13:58:55 +02:00
Eren G??lge a58fb6c01b Update requirements 2023-06-22 13:53:19 +02:00
Eren G??lge ddbb27547a Update CI 2023-06-22 13:51:58 +02:00
Eren G??lge e888e8a56d Fix manage 2023-06-22 10:13:20 +02:00
Eren Gölge fff8b762bc
Merge branch 'dev' into bark 2023-06-21 15:49:05 +02:00
Eren Gölge 4cf8652392
Fix Tortoise load (#2697)
* Handle missing gpt weights

* Make style

* Fix lint
2023-06-21 15:42:01 +02:00
Eren G??lge 9190f1a5f3 Update requirements 2023-06-21 12:22:37 +02:00
Eren G??lge 8597ee13af Update requirements 2023-06-21 12:21:22 +02:00
Eren G??lge cf98ae04df Make lint 2023-06-21 12:05:08 +02:00
Eren G??lge 3b9fca2398 Make style 2023-06-21 12:02:06 +02:00
Eren G??lge 0f8932a6a9 Fix here and there 2023-06-21 11:59:27 +02:00
Eren G??lge 03c347b7f3 Update Bark Config 2023-06-21 11:58:18 +02:00
Eren G??lge 695e862aad Update model URLs 2023-06-21 11:57:46 +02:00
Eren G??lge e89aa97025 Update pylintrc 2023-06-21 11:57:33 +02:00
Eren G??lge f4c88ed677 Make style 2023-06-19 14:22:32 +02:00
Eren G??lge 37b708dac7 Add bark model 2023-06-19 14:16:06 +02:00
Eren G??lge 2364c38d16 Update synthesizer 2023-06-19 14:15:21 +02:00
Eren G??lge 5a31fad502 Download HF models 2023-06-19 14:14:04 +02:00
Eren G??lge f59da4dba5 Draft Bark implementation 2023-06-12 14:32:39 +02:00
Tsai Meng-Ting d65819422b
Update stochastic_duration_predictor.py (#2663)
fix a typo
2023-06-12 11:10:54 +02:00
Reuben Morais 2d967b786d
Fix typo in README [skip ci] 2023-06-08 09:47:10 +02:00
Eren Gölge 49cf6a5d62 Bump up to v0.14.3 2023-06-06 09:41:59 +02:00
Eren Gölge 8e415732dd Fixup 2023-06-06 09:41:46 +02:00
Eren Gölge 547a72c97d Fixup 2023-06-05 22:38:56 +02:00
Eren Gölge a494f0c92a Bump up to v0.14.1 2023-06-05 11:29:10 +02:00
Eren Gölge 50b1074779 Make `tts` ready 2023-06-05 11:29:10 +02:00
Eren Gölge e785d101a1
Port Fairseq TTS models (#2628)
* Load fairseq models

* Add docs and missing files

* Managing fairseq models and docs for API

* Make style

* Use scarf URL

* Add tests

* Fix URL

* Pass cpu

* Make lint

* Fixup

* Make lint

* fixup

* Fixup

* Change tokenization order

* Update README

* Fixup

* Fixup
2023-06-05 11:15:13 +02:00
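Usage as documented for the ported Fairseq (MMS) models, which are addressed by an ISO language code (the code and text below are examples):
```
from TTS.api import TTS

# Fairseq/MMS models follow the naming scheme tts_models/<iso-code>/fairseq/vits
tts = TTS("tts_models/deu/fairseq/vits")
tts.tts_to_file(text="Hallo, wie geht es dir?", file_path="fairseq_de.wav")
```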
Shukrullo Turgunov 0d5e68a09f
fix typo (#2647)
* fix typo

* typo fix
2023-06-05 09:58:16 +02:00
Eren G??lge deebc0cc16 Add bark requirements 2023-05-23 10:12:26 +02:00
Reuben Morais 23a7a9a363
Fetch all built-in speakers (#2626) 2023-05-22 17:28:08 +02:00
Eren Gölge aef7f6d980 Bump up to v0.14.1 2023-05-18 11:13:09 +02:00
Eren Gölge 9e99e0f42d Disable reduction 2023-05-18 11:12:51 +02:00
Eren Gölge bc0a532c7a
Bump up to v0.14.0 2023-05-16 10:08:41 +02:00
Eren Gölge 7d6e18629f Add Tortoise to README 2023-05-16 01:33:56 +02:00
Eren Gölge 01aa0034b2 Fixup docs 2023-05-16 01:24:25 +02:00
Eren Gölge 4de797bb11
Draft ONNX export for VITS (#2563)
* Draft ONNX export for VITS

Could not get it to output variable-length sequences

* Fixup for onnx constant output

* Make style

* Remove commented code
2023-05-16 01:07:56 +02:00
Eren Gölge 16c9df0dfe
Fix API CI (#2616)
* Fix indentation

* Fixup

* Make style
2023-05-16 01:05:35 +02:00
manmay nakhashi a3d5801c44
Tortoise TTS inference (#2547)
* initial commit

* Tortoise inference

* revert path change

* style fix

* remove accidental remove

* style fixes

* style fixes

* removed unwanted assets and deps

* remove changes

* remove cvvp

* style fix black

* added tortoise config and updated config and args, refactoring the code

* added tortoise to api

* Pull mel_norm from url

* Use TTS cleaners

* Let download model files

* add ability to pass tortoise presets through coqui api

* fix tests

* fix style and tests

* fix tts commandline for tortoise

* Add config.json to tortoise

* Use kwargs

* Use regular model api for loading tortoise

* Add load from dir to synthesizer

* Fix Tortoise floats

* Use model_dir when there are multiple urls

* Use `synthesize` when exists

* lint fixes and resolve preset bug

* resolve a download bug and update model link

* fix json

* do tortoise inference from voice dir

* fix

* fix test

* fix speaker id and remove assets

* update inference_tests.yml

* replace inference_test.yml

* fix extra dir as None

* fix tests

* remove space

* Reformat docstring

* Add docs

* Update docs

* lint fixes

---------

Co-authored-by: Eren Gölge <egolge@coqui.ai>
Co-authored-by: Eren Gölge <erogol@hotmail.com>
2023-05-16 00:58:21 +02:00
Eren Gölge 0b6b957e76 Drop API tests when token is not available.
This was a necessary test, but for my sanity
I just drop it until I find the reason why
the secret is not recognized in PR CI tests.
2023-05-11 18:13:37 +02:00
Eren Gölge 9b5822d625
Update VAD for silence trimming. (#2604)
* Update vad for mp3 and fault tolerance

* Make style

* Remove import

* Remove stupid defaults
2023-05-11 11:09:23 +02:00
Eren Gölge 5c89c621ca
Warn when lang is not avail (#2460)
* Warn when lang is not avail

* Make style

* Make lint
2023-05-11 11:08:43 +02:00
Eren Gölge dfb51e06b2
Add jenny model (#2603) 2023-05-08 12:05:40 +02:00
Atharva Shah ba40a1c5c6
Update README.md (#2577)
Update link to point to blob instead of edit
2023-05-08 11:18:55 +02:00
Michael Görner 27e237ed08
use default_factory for audio parameter (#2576)
Python 3.11 complains about the mutable default and other members
were already adapted to use the factory, so I expect this line just
went unnoticed until now.
2023-05-08 11:17:36 +02:00
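The pattern in question, sketched with a stand-in config class (the real field is the audio config of the affected dataclass):
```
from dataclasses import dataclass, field

@dataclass
class StandInConfig:
    # A mutable default like `audio: dict = {}` is rejected by dataclasses (Python 3.11
    # tightened this check to any unhashable default); default_factory builds a fresh
    # value per instance instead.
    audio: dict = field(default_factory=dict)
    name: str = "example"

print(StandInConfig().audio)  # {} -- each instance gets its own dict
```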
Julian Weber eec6beb966
rm pip cache (#2600) 2023-05-08 10:38:21 +02:00
Edresson Casanova 51a3d45025
Add FR and ES gruut languages as requirement to avoid inference issues (#2572)
* Add all gruut-supported languages as a requirement to avoid inference issues

* Remove unused gruut languages
2023-05-03 09:49:01 -03:00
prakharpbuf c1875f68df
typos and minor fixes (#2508)
* Update tacotron1-2.md

* Update README.md

* Update Tutorial_2_train_your_first_TTS_model.ipynb

* Update synthesizer.py

There is no arg called --speaker_name

* Update formatting_your_dataset.md

* Update AnalyzeDataset.ipynb

* Update AnalyzeDataset.ipynb

* Update AnalyzeDataset.ipynb

* Update finetuning.md

* Update train_yourtts.py

* Update train_yourtts.py

* Update train_yourtts.py

* Update finetuning.md
2023-04-26 15:22:57 +02:00
Eren Gölge 2071088bab
Bump up to v0.13.3 2023-04-17 16:13:35 +02:00
Eren Gölge b2bc2ac797
Merge pull request #2532 from coqui-ai/bangla_model
Bangla models
2023-04-17 16:13:00 +02:00
Eren Gölge 1a6a5710fd Make lint 2023-04-17 15:02:56 +02:00
Eren Gölge a44a0e1fd2 Update model urls 2023-04-17 14:53:27 +02:00
Eren Gölge d3e215f8bd Add link 2023-04-17 13:48:55 +02:00
Eren Gölge 2533a18d62 Add BN tests 2023-04-17 13:37:10 +02:00
Eren Gölge 2d49c05259 Remove import 2023-04-17 13:05:29 +02:00
Eren Gölge 5e5768d784 Fix API 2023-04-17 13:05:19 +02:00
Eren Gölge bce819a624 Add docs for adding a new lang frontend 2023-04-17 12:54:35 +02:00
Eren Gölge 6505553da5 Add BN requirements 2023-04-17 12:54:14 +02:00
Eren Gölge cd83991067 Add BN phonemizer 2023-04-17 12:54:00 +02:00
Eren Gölge 36be05290d Add models 2023-04-17 12:52:32 +02:00
Eren Gölge e4c5c27854
Bump up to v0.13.2 2023-04-14 10:23:39 +02:00
Eren Gölge dba5cec497
Merge pull request #2509 from coqui-ai/update_vad
Update VAD
2023-04-13 19:35:17 +02:00
Eren Gölge e07c6f54fd
Merge pull request #2515 from coqui-ai/tts_cmd
🐸Studio models by `tts`
2023-04-13 19:34:28 +02:00
Eren Gölge 5a9bda13f3 Make style 2023-04-13 14:19:06 +02:00
Eren Gölge c9375e4b8b Make style 2023-04-13 14:17:06 +02:00
Eren Gölge 758ef84cc2 Using 🐸Studio models with `tts` command 2023-04-13 14:14:41 +02:00
Eren G??lge 537dc0e933 Update VAD 2023-04-13 00:39:46 +02:00
Eren Gölge e33e7170ed Bump up to v0.13.1 2023-04-12 16:20:53 +02:00
Eren Gölge 8da3342676 Ping API 2023-04-12 16:20:53 +02:00
Eren Gölge 73d963718a
Merge pull request #2495 from coqui-ai/api_voice_conversion
Api voice conversion
2023-04-11 16:40:14 +02:00
Eren Gölge cbb592b295 Fixup 2023-04-10 14:50:11 +02:00
Eren Gölge b8b9f09de5 Fixup 2023-04-10 14:06:31 +02:00
Eren Gölge 76511972e9 Add freevc to the models list 2023-04-10 14:03:08 +02:00
Eren Gölge 209f0a509a Add voice conversion api test 2023-04-10 13:37:47 +02:00
Eren Gölge a49c1931d9 Fixup 2023-04-10 13:33:42 +02:00
Eren Gölge 5bd1fb6b2c Fix API for voice conversion 2023-04-10 13:32:16 +02:00
Eren Gölge 30109af2a0
Merge pull request #2480 from MattyB95/librosa_v0.10.0
Update Librosa Version To V0.10.0
2023-04-07 12:32:33 +02:00
Matthew Boakes 5bdd6f7c18 Updated Librosa Dependency Specification 2023-04-06 12:36:24 +01:00
Eren Gölge 1233365cf4 Bump up to v0.13.0 2023-04-05 15:09:31 +02:00
Eren Gölge ad8b9bf2be
🐸 Coqui Studio API integration (#2484)
* Warn when lang is not avail

* Make style

* Implement Coqui Studio API

* Test

* Update docs

* Set action

* Make style

* Make lint

* Update README

* Make style

* Fix action

* Run actions
2023-04-05 15:06:50 +02:00
Wesley Pyburn ce79160576
Fix errors in README.md (#2478)
Running the sample code below results in an error `language_id = self.tts_model.language_manager.name_to_id[language_name]`.
The fix is to run the code with the correct language strings; the README has been updated accordingly in this PR.

I assume this small typo leads to #2456 and #2458
2023-04-05 12:23:07 +02:00
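A defensive pattern that avoids the lookup error: query the model for its exact language and speaker strings before synthesizing (model name is an example):
```
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/your_tts")
print(tts.languages)  # the exact strings accepted by the language argument
print(tts.speakers)

tts.tts_to_file(text="Hello everyone.", language=tts.languages[0],
                speaker=tts.speakers[0], file_path="out.wav")
```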
Matthew Boakes 4c829e74a1 Update Librosa Version To V0.10.0 2023-04-05 00:59:20 +01:00
Yingzhi WANG 95fa2c9fd6
fix typo (#2475) 2023-04-03 23:31:09 +02:00
p0p 91cf1b2da9
[minor] batch["speaker_ids"] getting set two times (#2470)
* [minor] batch["speaker_ids"] getting set two times

just to make it consistent with language_ids

* Update vits.py

style.
2023-04-03 11:35:21 +02:00
Rajiv P c2d15cd413
[minor] hifigan_generator.py typo (#2462)
resblock2 description updated.
2023-03-28 12:43:36 +02:00
Eren Gölge d309f50e53
Implement FreeVC (#2451)
* Update .gitignore

* Draft FreeVC implementation

* Tests and relevant updates

* Update API tests

* Add missings

* Update requirements

* :(

* Lazy handle for vc

* Update docs for voice conversion

* Make style
2023-03-25 18:33:23 +01:00
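How this surfaces in the high-level API added alongside this work (model name as listed in the released voice-conversion models):
```
from TTS.api import TTS

# FreeVC converts the speech in source_wav into the voice of target_wav
vc = TTS("voice_conversion_models/multilingual/vctk/freevc24")
vc.voice_conversion_to_file(source_wav="my_recording.wav",
                            target_wav="target_speaker.wav",
                            file_path="converted.wav")
```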
Eren Gölge 090cadf270
Update numba version (#2435) 2023-03-21 11:40:42 +01:00
Khalid Bashir 14c80dd1fd
vits.py training fixed due to return_complex (#2418)
Torch now requires `return_complex` to be set for the `torch.stft` method.
This turned the warning into an error:
```
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1591, in fit
    self._fit()
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1544, in _fit
    self.train_epoch()
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1309, in train_epoch
    _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time)
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1162, in train_step
    outputs, loss_dict_new, step_time = self._optimize(
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1023, in _optimize
    outputs, loss_dict = self._model_train_step(batch, model, criterion, optimizer_idx=optimizer_idx)
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 970, in _model_train_step
    return model.train_step(*input_args)
  File "/workspace/coqui-tts/TTS/tts/models/vits.py", line 1293, in train_step
    mel_slice_hat = wav_to_mel(
  File "/workspace/coqui-tts/TTS/tts/models/vits.py", line 191, in wav_to_mel
    spec = torch.stft(
  File "/usr/local/lib/python3.10/dist-packages/torch/functional.py", line 641, in stft
    return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
RuntimeError: stft requires the return_complex parameter be given for real inputs, and will further require that return_complex=True in a future PyTorch release.
```
2023-03-19 00:22:04 +01:00
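A minimal sketch of the pattern behind the fix, not the exact `wav_to_mel` code; the STFT parameters are illustrative:

```python
import torch

x = torch.randn(1, 22050)          # real-valued audio batch
window = torch.hann_window(1024)

# Newer PyTorch requires return_complex to be given explicitly for real inputs.
spec = torch.stft(
    x, n_fft=1024, hop_length=256, win_length=1024,
    window=window, center=True, return_complex=True,
)
magnitude = spec.abs()             # or torch.view_as_real(spec) for the old real/imag layout
```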
Eren Gölge 12f3365185 Merge branch 'dev' 2023-03-17 13:31:08 +01:00
Eren Gölge 2db262747e
Bump up to v0.12.0 2023-03-17 13:21:03 +01:00
Roee Shenberg 3c15f0619a
Bug fixes in OverFlow audio generation (#2380) 2023-03-15 12:02:11 +01:00
Eren Gölge b8d9837d27
Merge pull request #2407 from dveni/patch-1
Update vits.py
2023-03-14 10:28:23 +01:00
Daniel Vera Nieto dfb48737fb Style fixed 2023-03-13 16:11:15 +01:00
Eren Gölge 9bb62c570d
Merge pull request #2390 from coqui-ai/dev
v0.12.0
2023-03-13 12:43:38 +01:00
Eren Gölge c10f9a3699
Update docs (#2389)
* Update docs index

* Add Mary-TTS docs

* Update docs index

* Add Overflow docs
2023-03-13 12:42:20 +01:00
Eren Gölge 4ca07514d4
Remove doc bot (#2399) 2023-03-13 12:42:01 +01:00
Dani Vera 0d12229b64
Update vits.py
This should fix the issue https://github.com/coqui-ai/TTS/issues/1986 without breaking batch data sampling.
2023-03-10 18:35:16 +01:00
manmay nakhashi 624513018d
add energy by default to Fastspeech2 config (#2326)
* add energy by default

* added energy to base tts

* fix energy dataset

* fix styles

* fix test
2023-03-06 10:20:25 +01:00
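A hedged sketch of what enabling the energy predictor could look like; the config class path and the `use_energy` field name are assumptions inferred from the commit messages, not verified against the release:

```python
# Assumed names: Fastspeech2Config and model_args.use_energy follow the commits above;
# check TTS/tts/configs/fastspeech2_config.py for the actual fields and defaults.
from TTS.tts.configs.fastspeech2_config import Fastspeech2Config

config = Fastspeech2Config()
config.model_args.use_energy = True   # energy predictor enabled, per "add energy by default"
print(config.model_args.use_energy)
```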
Florian Quirin 478c8178b8
Basic Mary-TTS API compatibility (#2352)
* added basic Mary-TTS API endpoints to server

- imported `parse_qs` from `urllib.parse` to parse HTTP POST parameters
- imported `render_template_string` from `flask` to return text as endpoint result
- added new routes:
  - `/locales` - returns list of locales (currently locale of active model)
  - `/voices` - returns list of voices (currently locale and name of active model)
  - `/process` - accepts synth. request (GET and POST) with parameter `INPUT_TEXT` (other parameters ignored since we have only one active model)

* better log messages for Mary-TTS API

- smaller tweaks to log output

* use f-string in log print to please linter

* updated server.py to match 'make style' result
2023-03-06 10:08:21 +01:00
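A self-contained sketch of routes shaped like the ones described above, using the same helpers (`parse_qs`, `render_template_string`); the `synthesize_to_wav_bytes` helper is hypothetical and stands in for the server's synthesizer:

```python
import io
from urllib.parse import parse_qs

from flask import Flask, request, render_template_string, send_file

app = Flask(__name__)

def synthesize_to_wav_bytes(text: str) -> bytes:
    # Hypothetical stand-in for the actual TTS synthesizer call.
    return b"RIFF...placeholder..."

@app.route("/locales", methods=["GET"])
def mary_locales():
    # Only the locale of the active model is reported.
    return render_template_string("{{ locale }}\n", locale="en_US")

@app.route("/voices", methods=["GET"])
def mary_voices():
    # "<name> <locale>" per line, Mary-TTS style.
    return render_template_string("{{ name }} {{ locale }}\n", name="active-model", locale="en_US")

@app.route("/process", methods=["GET", "POST"])
def mary_process():
    if request.method == "POST":
        data = parse_qs(request.get_data(as_text=True))
        text = data.get("INPUT_TEXT", [""])[0]
    else:
        text = request.args.get("INPUT_TEXT", "")
    return send_file(io.BytesIO(synthesize_to_wav_bytes(text)), mimetype="audio/wav")
```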
thennal10 d39bc74f57
OverFlow with test sentences (#2253)
* Fix typo in function definition

* Swap hasattr out

hasattr(self, "speaker_manager") and hasattr(self, "language_manager") seem redundant since BaseTTS defines both.
2023-03-01 09:11:30 +01:00
Edresson Casanova 16b9862252
Fix Speaker Consistency Loss (SCL) (#2364) 2023-02-27 09:14:00 +03:00
p0p4k a365a7e888
numpy version for py310 (#2316)
* numpy version for py310

requested in #2315

* Update requirements.txt
2023-02-13 10:34:00 +01:00
Eren Gölge d488b4f1c6 Merge branch 'dev' into main 2023-02-10 17:39:37 +01:00
Eren Gölge 661725b95e Bump up to v0.11.1 2023-02-10 15:59:05 +01:00
Eren Gölge 0196b4dfbf Merge branch 'add_neural_hmm_model' into dev 2023-02-10 15:23:56 +01:00
Eren Gölge ea5bd7dcbc Merge branch 'dev' into main 2023-02-10 10:27:34 +01:00
Eren Gölge 914280a556
Bump up to v0.11.0 (#2329)
* Make style

* Bump up to v0.11.0
2023-02-08 13:58:49 +01:00
Eren Gölge 6cfb590eb2 Merge branch 'dev' into main 2023-02-06 11:47:18 +01:00
Eren Gölge 683b4d432f Fixup 2023-02-06 11:44:56 +01:00
Eren Gölge c7184dcef9 Linter fix 2023-02-06 11:30:36 +01:00
Eren Gölge 910a218652 Merge branch 'dev' into main 2023-02-06 11:25:33 +01:00
Eren Gölge 4e75b6262c Update docs 2023-02-06 11:20:32 +01:00
Eren Gölge 85b3a04b37 Merge branch 'api_model_path' into dev 2023-02-06 11:18:00 +01:00
Eren Gölge c496b1a986 Linter fix 2023-02-06 11:17:28 +01:00
Eren Gölge baed2a2c2b Update README 2023-02-06 11:15:43 +01:00
marius851000 1f4d8bf0f1
Fix tts-server for multi-lingual models (#2257) 2023-02-06 10:54:34 +01:00
Eren Gölge 6ee94f8bad Fixup 2023-01-30 14:02:25 +01:00
Eren Gölge 713e8c8d04 Add pretrained model 2023-01-30 13:55:17 +01:00
Eren Gölge 7fddabc8ac Implement cloning in API 2023-01-30 13:35:48 +01:00
Eren Gölge 335b8ed44e Add vocoder path 2023-01-30 12:59:29 +01:00
Martin Weinelt 994be163e1
Use packaging.version for version comparisons (#2310)
* Use packaging.version for version comparisons

The distutils package is deprecated¹ and relies on PEP 386² version
comparisons, which have been superseded by PEP 440³ which is implemented
through the packaging module.

With more recent distutils versions, provided through setuptools
vendoring, we are seeing the following exception during version
comparisons:

> TypeError: '<' not supported between instances of 'str' and 'int'

This is fixed by this migration.

[1] https://docs.python.org/3/library/distutils.html
[2] https://peps.python.org/pep-0386/
[3] https://peps.python.org/pep-0440/

* Improve espeak version detection robustness

On many modern systems espeak is just a symlink to espeak-ng. In that
case looking for the 3rd word in the version output will break the
version comparison, when it finds `text-to-speech:`, instead of a proper
version.

This will not break during runtime, where espeak-ng would be
prioritized, but the phonemizer and tokenizer tests force the backend
to `espeak`, which exhibits this breakage.

This improves the version detection by simply looking for the version
after the "text-to-speech:" token.

* Replace distuils.copy_tree with shutil.copytree

The distutils module is deprecated and slated for removal in Python
3.12. Its usage should be replaced, in this case by a compatible method
from shutil.
2023-01-29 23:47:00 +01:00
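A short sketch of both changes described above; the version strings are illustrative:

```python
from packaging.version import Version

# PEP 440 comparison instead of the deprecated distutils version classes.
assert Version("1.48.15") < Version("1.50")

def espeak_version_from_output(output: str) -> Version:
    # Works for both "eSpeak text-to-speech: 1.48.15 ..." and
    # "eSpeak NG text-to-speech: 1.51 ...": take the word right after the token.
    words = output.split()
    return Version(words[words.index("text-to-speech:") + 1])

print(espeak_version_from_output("eSpeak NG text-to-speech: 1.51  Data at: /usr/share/espeak-ng-data"))
```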
Eren Gölge cf076345e7 Make style 2023-01-23 13:49:51 +01:00
Eren Gölge 13334d507c Load model from path 2023-01-23 13:45:45 +01:00
Gerard Sant Muniesa c59b3f75b8
Add Catalan text cleaners for Catalan support (#2295) 2023-01-23 11:56:30 +01:00
Shivam Mehta d83ee8fe45
Adding neural HMM TTS Model (#2272)
* Adding neural HMM TTS

* Adding tests

* Adding neural hmm on readme

* renaming training recipe

* Removing overflow's decoder parameters from the config

* Update the Trainer requirement version for a compatible one (#2276)

* Bump up to v0.10.2

* Adding neural HMM TTS

* Adding tests

* Adding neural hmm on readme

* renaming training recipe

* Removing overflow's decoder parameters from the config

* fixing documentation

Co-authored-by: Edresson Casanova <edresson1@gmail.com>
Co-authored-by: Eren Gölge <erogol@hotmail.com>
2023-01-23 11:53:04 +01:00
Eren Gölge 497f22b20b
Cache speaker encoder model (#2284) 2023-01-23 11:49:51 +01:00
Eren Gölge 6e3f74fc29 Fix #2191 2023-01-15 23:11:57 +01:00
manmay nakhashi bc422f2f3c
Fastspeech2 (#2073)
* added EnergyDataset

* add energy to Dataset

* add compute_energy

* added energy params

* added energy to forward_tts

* added plot_avg_energy for visualisation

* Update forward_tts.py

* create file

* added fastspeech2 recipe

* add fastspeech2 config

* removed energy from fast pitch

* add energy loss to forward tts

* Update fastspeech2_config.py

* change run_name

* Update numpy_transforms.py

* fix typo

* fix typo

* fix typo

* linting issues

* use_energy default value --> False

* Update numpy_transforms.py

* linting fixes

* fix typo

* linting fix

* linting fix

* fix

* fixes

* fixes

* lint fix

* lint fixws

* added training test

* wrong import

* wrong import

* trailing whitespace

* style fix

* changed class name because of error

* class name change

* class name change

* change class name

* fixed styles
2023-01-15 22:39:22 +01:00
Eren Gölge 14d45b5347
Bump up to v0.10.2 2023-01-11 01:06:02 +01:00
Edresson Casanova 49dfaa5234
Update the Trainer requirement version for a compatible one (#2276) 2023-01-11 01:01:46 +01:00
Khalid Bashir 42afad5e79
Fixed bug related to yourtts speaker embeddings issue (#2234)
* Fixed bug related to yourtts speaker embeddings issue

* Reverted code for base_tts

* Bug fix on VITS d_vector_file type

* Ignore the test speakers on YourTTS recipe

* Add speaker encoder model and config on YourTTS recipe to easily do zero-shot inference

* Update YourTTS config file

* Update ModelManager._update_path to deal with list attributes

* Fix lint checks

* Remove unused code

* Fix unit tests

* Reset name_to_id to get the right speaker ids on load_embeddings_from_list_of_files

* Set weighted_sampler_multipliers as an empty dict to prevent users' mistakes

Co-authored-by: Edresson Casanova <edresson1@gmail.com>
2023-01-02 14:20:02 +01:00
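A rough sketch of the list-aware path update mentioned above ("Update ModelManager._update_path to deal with list attributes"); the helper name is illustrative, not the actual method:

```python
def set_path_attr(config, attr_name, new_value):
    """Update a config path attribute that may hold a single path or a list of paths."""
    current = getattr(config, attr_name, None)
    if isinstance(current, list):
        # e.g. a d_vector_file attribute that holds several embedding files
        setattr(config, attr_name, new_value if isinstance(new_value, list) else [new_value])
    else:
        setattr(config, attr_name, new_value)
```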
Eren Gölge da93d768b8 Update docs 2023-01-02 10:07:03 +01:00
Julian Weber a07397733b
Multilingual tokenizer (#2229)
* Implement multilingual tokenizer

* Add multi_phonemizer recipe

* Fix lint

* Add TestMultiPhonemizer

* Fix lint

* make style
2023-01-02 10:03:19 +01:00
Eren Gölge a31af762e8
v0.10.1 (#2242)
* Add Ukrainian LADA (female) voice

* Add ca and fa models

* Add pth files to manager

* Bump up to v0.10.1

Co-authored-by: Yehor Smoliakov <yehors@ukr.net>
2022-12-26 15:46:21 +01:00
Eren Gölge f814d52394 Bump up to v0.10.1 2022-12-26 14:29:46 +01:00
Eren Gölge 8c32a6998a Add pth files to manager 2022-12-26 14:29:25 +01:00
Eren Gölge cf765cb3f2 Add ca and fa models 2022-12-26 14:29:10 +01:00
Eren Gölge 0910cb76bc
Merge pull request #2226 from egorsmkv/patch-1
Add Ukrainian LADA (female) voice
2022-12-16 13:17:16 +01:00
Yehor Smoliakov 046b137946
Add Ukrainian LADA (female) voice 2022-12-16 12:30:44 +02:00
Eren Gölge a04db8d632
Merge pull request #2205 from coqui-ai/dev
🚀 v0.10.0
2022-12-15 12:02:16 +01:00
Eren Gölge 46b0ad37e7 Bump up to v0.10.0 2022-12-15 11:19:23 +01:00
Eren Gölge a9167cf239
Fixup overflow (#2218)
* Update overflow config

* Pulling shuffle and drop_last  from config

* Print training stats for overflow
2022-12-15 00:56:48 +01:00
Eren Gölge ecea43ec81
Adding pre-trained Overflow model (#2211)
* Adding pretrained Overflow model

* Stabilize HMM

* Fixup model manager

* Return `audio_unique_name` by default

* Distribute max split size over datasets

* Fixup eval_split_size

* Make style
2022-12-14 16:55:48 +01:00
Edresson Casanova 061ac43187
Add Original YourTTS vocabulary for full transfer learning (#2206) 2022-12-13 09:02:10 +01:00
Edresson Casanova 3b1a28fa95
Add YourTTS VCTK recipe (#2198)
* Add YourTTS VCTK recipe

* Fix lint

* Add compute_embeddings and resample_files functions to be able to reuse them

* Add automatic download and speaker embedding computation for YourTTS VCTK recipe

* Add parameter for eval metadata file on compute embeddings function
2022-12-12 16:14:25 +01:00
Shivam Mehta 3b8b105b0d
Adding OverFlow (#2183)
* Adding encoder

* currently modifying hmm

* Adding hmm

* Adding overflow

* Adding overflow setting up flat start

* Removing runs

* adding normalization parameters

* Fixing models on same device

* Training overflow and plotting evaluations

* Adding inference

* At the end of epoch the test sentences are coming on cpu instead of gpu

* Adding figures from model during training to monitor

* reverting tacotron2 training recipe

* fixing inference on gpu for test sentences on config

* moving helpers and texts within overflows source code

* renaming to overflow

* moving loss to the model file

* Fixing the rename

* Model training but not plotting the test config sentences' audio

* Formatting logs

* Changing model name to camelcase

* Fixing test log

* Fixing plotting bug

* Adding some tests

* Adding more tests to overflow

* Adding all tests for overflow

* making changes to camel case in config

* Adding information about parameters and docstring

* removing compute_mel_statistics; moved statistic computation to the model instead

* Added overflow in readme

* Adding more test cases; transition_p is no longer saved as a tensor and can be dumped as JSON
2022-12-12 12:44:15 +01:00
p0p4k 2e153d54a8
Adding missing key to formatter (#2194)
quick fix for #2156.
 added 'root_path' key.
2022-12-12 12:25:37 +01:00
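For reference, a sketch of the sample dict a formatter returns after the fix, with the previously missing `root_path` key; the values and the other field names are illustrative:

```python
sample = {
    "text": "Hello world.",
    "audio_file": "/data/LJSpeech-1.1/wavs/LJ001-0001.wav",
    "speaker_name": "ljspeech",
    "root_path": "/data/LJSpeech-1.1",   # the key added by this fix
}
```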
Eren Gölge 0c9fa2229b
Update README (#2204) 2022-12-12 12:20:50 +01:00
Eren Gölge 1ddc484b49
Python API implementation (#2195)
* Draft implementation

* Fix style

* Add api tests

* Fix lint

* Update docs

* Update tests

* Set env

* Fixup

* Fixup

* Fix lint

* Revert
2022-12-12 12:04:20 +01:00
Eren Gölge fdeefcc612
Handle espeak 1.48.15 (#2203) 2022-12-12 11:23:45 +01:00
Eren Gölge 24620743ca
Merge pull request #2187 from coqui-ai/dev-fix-vc 2022-12-06 21:27:34 +01:00
Eren Gölge c753ad49cc
Merge pull request #2189 from coqui-ai/fix-capacitron-test 2022-12-06 21:25:06 +01:00
WeberJulian 4787a2a993 Fix capacitron test when cuda is enabled 2022-12-06 18:07:48 +01:00
Edresson Casanova d2460de94b Fix unit tests 2022-12-05 09:59:11 -03:00
Edresson Casanova ee20e30958 Fix VITS multi-speaker voice conversion inference 2022-12-05 09:15:01 -03:00
Eren Gölge 9321b22203
Fix scheduler order 2022-12-05 12:26:15 +01:00
Eren Gölge c50d89fcf7
Merge pull request #2161 from coqui-ai/tutorials
fixed tutorial 2 incompatibility with new dev
2022-11-21 19:08:39 +01:00
Aya be0ba934ee cleared output 2022-11-21 14:50:07 +00:00
Aya 372605180e fixed tutorial 2 incompatibility with new dev 2022-11-21 14:44:40 +00:00
Eren Gölge 56ba616a03
Merge pull request #1942 from coqui-ai/dev
v0.9.0
2022-11-16 16:50:57 +01:00
Eren Gölge bc6120c330 [ci skip]Bump up to v0.9.0 2022-11-16 16:45:02 +01:00
Julian Weber 84b9b0879e
Fix documentation (#2154) 2022-11-16 16:13:07 +01:00
logan hart ff9b63d02a
Add neon models (#2140)
* Add neon ljspeech vits model

* Add neon german model

* Update .models.json

* Add neon spanish model

* Add french model

* Add Dutch model

* Add Hungarian model

* Add Greek model

* Remove unneeded description

* Update .models.json

* Update .models.json

* Handling neon models

* Add all neon models

* Update .models.json

* Split zoo_tests

* Update test names

* Update model testing

Co-authored-by: Eren Gölge <erogol@hotmail.com>
2022-11-16 16:12:39 +01:00
Eren Gölge a0f31df481
Fix README.md 2022-11-16 12:27:58 +01:00
Julian Weber 3191c5f1fe
Doc update docker (#2153)
* Complete Dockerignore to keep context manageable

* Add documentation on readme

* Match pip and docker cuda version

* Use pip3 consistently
2022-11-16 00:21:56 +01:00
Eren Gölge 4114136717
Add docker docs 2022-11-15 23:17:30 +01:00
Julian Weber f85609f9bf
Make docker images lighter (#2149) 2022-11-15 00:11:32 +01:00
Eren Gölge 7689fadd86
Remove gitter link 2022-11-14 10:44:17 +01:00
Ikko Ashimine 42edcad45f
Update README.md (#2146)
Github -> GitHub
2022-11-14 10:41:27 +01:00
Eren Gölge 38c99f2507
Update dep caching in actions (#2138) 2022-11-09 22:15:11 +01:00
Eren Gölge 8cb1433e6e
Cache fsspec downloads (#2132)
* Cache fsspec downloaded files

* Use diff paths for test

* Make fsspec caching optional

* Decom GPU docker tests

* Make progress bar optional for better CI log

* Check path local
2022-11-09 22:12:48 +01:00
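A small sketch of local caching for an fsspec download, illustrating the caching idea rather than the ModelManager code; the URL and cache directory are illustrative:

```python
import fsspec

# Chain the "filecache" protocol in front of the remote URL so repeated
# downloads are served from a local cache directory.
with fsspec.open(
    "filecache::https://example.com/models/model.zip",
    filecache={"cache_storage": "/tmp/tts_fsspec_cache"},
) as f:
    payload = f.read()
print(len(payload))
```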
Eren Gölge c5412532ac
Remove langs expect en and de (#2135) 2022-11-09 11:58:34 +01:00
Eren Gölge c16804f5d0
Add Discord server badge (#2136) 2022-11-09 11:27:07 +01:00
Eren Gölge b686c09704 Fix #2062 2022-11-07 09:22:43 +01:00
freezerain fcbfca869f
Fix back/forward slash in file path in mailabs formatter (#1938)
* mailabs formatter: back/forward slash in file path fix

* formatters.mailabs() path rework for Windows os

* new formatter added "mailabs_win"

* lint test fix commit

* mailabs_win: removed, mailabs: "/" replaced with os.sep for windows compatibility

* Black small style fix
2022-11-01 12:54:40 +01:00
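The gist of the fix is a one-line path normalization; a sketch with an illustrative M-AILABS relative path:

```python
import os

rel_path = "en_US/by_book/female/judy_bieber/dorothy_and_wizard_oz/wavs/sample.wav"
wav_file = os.path.join("/data/m-ailabs", rel_path.replace("/", os.sep))
print(wav_file)
```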
Victor Shepardson 5307a2229b
Fix Capacitron training (#2086) 2022-11-01 12:52:06 +01:00
Ahmed Husain 5ccef6e665
Use "formatter" key in the datasets json array (#2114)
Fix tutorial docs
2022-11-01 12:51:16 +01:00
Marek Šuppa fa0e71d0b6
Update forward_tts.md (#2019)
A small typo update in `forward_tts.md`
2022-10-29 12:52:24 +02:00
CeadeS 0207071f62
Update Tutorial_2_train_your_first_TTS_model.ipynb (#2079)
Fixes an inconsistent metadata file name in the metadata format example.
2022-10-25 18:36:39 +02:00
Eren Gölge ef82adcc51
Adding announcements to README.md 2022-10-15 19:11:16 +02:00
Eren Gölge dae79b0acd
Remove `/` prefix from the relative path (#2065) 2022-10-10 13:32:27 +02:00
Eren Gölge 843fa6f3fa
Check num of columns in coqui format (#2066)
* Check 4 columns in coqui format

* Fix encoding

* Fixup
2022-10-10 12:13:32 +02:00
Edresson Casanova f3b947e706
Minor bug fixes on VITS/YourTTS and inference (#2054)
* Set the right device to the speaker encoder

* Bug fix on inference list_language_idxs parameter

* Bug fix on speaker encoder resample audio transform
2022-10-06 22:23:54 +02:00
Eren Gölge 5f5d441ee5
Write non-speech files in a TXT (#2048)
* Write non-speech files in a txt

* Save 16-bit wav out of vad
2022-10-06 13:25:54 +02:00
Edresson Casanova d6ad9a05b4
Fix colliding dataset cache file names (#1994)
* Fix colliding dataset cache file names

* Remove unused code
2022-09-21 12:54:07 +02:00
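A hedged sketch of one way to avoid colliding cache file names by keying the cache on the dataset path; the function is illustrative, not the code in this PR:

```python
import hashlib
import os

def phoneme_cache_path(cache_dir: str, dataset_root: str) -> str:
    digest = hashlib.md5(dataset_root.encode("utf-8")).hexdigest()[:10]
    return os.path.join(cache_dir, f"phoneme_cache_{digest}")

print(phoneme_cache_path("/tmp/tts_cache", "/data/LJSpeech-1.1"))
print(phoneme_cache_path("/tmp/tts_cache", "/data/VCTK"))
```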
Edresson Casanova 3faccbda97
Fix dataset handling with the new embedding file keys (#1991) 2022-09-19 23:44:14 +02:00
Eren Gölge 0a112f7841
Add metafile arg (#1977) 2022-09-16 14:41:49 +02:00
Eren Gölge dba2c3570a
Update readme (#1978) 2022-09-16 12:01:46 +02:00
Julian Weber 896e46d0e5
Fix vc (#1971) 2022-09-16 12:01:26 +02:00
Eren Gölge b95cf3363c
Prevent installing mecab-ko (#1967) 2022-09-14 10:28:07 +02:00
Eren Gölge 9e5a469c64
d-vector handling (#1945)
* Update BaseDatasetConfig

- Add dataset_name
- Change name to formatter_name

* Update compute_embedding

- Allow entering dataset by args
- Use released model by default
- Use the new key format

* Update loading

* Update recipes

* Update other dep code

* Update tests

* Fixup

* Load multiple embedding files

* Fix argument names in dep code

* Update docs

* Fix argument name

* Fix linter
2022-09-13 14:10:33 +02:00
Edresson Casanova 371772c355
Replace pyworld by pyin (#1946)
* Replace pyworld by pyin

* Fix unit tests
2022-09-09 10:43:14 +02:00
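A minimal sketch of f0 extraction with librosa's `pyin`, the replacement named above; the file name and frequency bounds are illustrative:

```python
import librosa
import numpy as np

y, sr = librosa.load("sample.wav", sr=22050)
f0, voiced_flag, voiced_prob = librosa.pyin(
    y,
    fmin=librosa.note_to_hz("C2"),
    fmax=librosa.note_to_hz("C7"),
    sr=sr,
)
f0 = np.nan_to_num(f0)  # pyin returns NaN for unvoiced frames
```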
happylittlecat 4546b4cbd8
Add espeak support for Chinese (#1905)
* fix description

* add espeak support for chinese

* add espeak support for chinese
2022-09-08 12:32:41 +02:00
harmlessman 5abbe56642
Korean Phonemizer (#1822)
* Update requirements.txt

install jamo for korean

* Update formatters.py

add KSS formatter

KSS is a Korean single-speaker speech dataset (12 hours)

* Add files via upload

add phonemizer for korean

* Add files via upload

add korean phonemizer

* Update requirements.txt

* change code style with `black` and `pylint`

* reflecting pylint's Evaluation

* reflecting pylint's Evaluation

* reflecting pylint's Evaluation-2

* isort

* edit about separator
write test case and add 'nltk' for requirements.txt

* add korean g2p (g2pkk)

* isort

* TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py:43:24: W0621: Redefining name 'text' from outer scope (line 58) (redefined-outer-name)

TTS/tts/utils/text/korean/korean.py:28:8: R1705: Unnecessary "else" after "return" (no-else-return)

* black
2022-09-08 12:06:07 +02:00
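A hedged sketch of the Korean grapheme-to-phoneme chain the commits above describe (g2pkk followed by jamo decomposition); the exact package APIs are assumptions:

```python
from g2pkk import G2p          # assumed import, per the "add korean g2p (g2pkk)" commit
from jamo import hangul_to_jamo

g2p = G2p()
text = "안녕하세요"
phonemes = "".join(hangul_to_jamo(g2p(text)))
print(phonemes)
```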
Edresson Casanova 98aa6261d1
Add YourTTS and SC-GlowTTS on available models (#1933) 2022-09-08 11:10:39 +02:00
Edresson Casanova 159eeeef64
Fix find unique phonemes script (#1928)
* Fix find unique phonemes script

* Fix unit tests
2022-09-08 10:17:35 +02:00
KyuubiYoru 3b7dff568a
Fixes a race condition with multiple simultaneous get requests. (#1807)
* Fixes a race condition with multiple simultaneous get requests.

* Removed unused import

* Removed unused threading import

* Changed lock style to notation

* make style

Co-authored-by: WeberJulian <julian.weber@hotmail.fr>
2022-09-08 10:16:16 +02:00
Julian Weber bb59718c03
Add capacitron v2 model (#1768)
* Add capacitron v2 in .models.json

* Put right commit hash
2022-09-08 09:43:56 +02:00
Edresson Casanova 096b35f639
Add VCTK speaker encoder recipe (#1912) 2022-08-26 16:19:03 +02:00
Eren Gölge e5430a6519
Add new DE Thorsten models (#1898)
- Tacotron2-DDC
- HifiGAN vocoder
2022-08-22 11:27:39 +02:00
Eren Gölge 8845f06fd9 Bump up to v0.8.0 2022-08-22 11:26:47 +02:00
480 changed files with 152112 additions and 2066 deletions

View File

@ -1,2 +1,9 @@
.git/
Dockerfile
build/
dist/
TTS.egg-info/
tests/outputs/*
tests/train_outputs/*
__pycache__/
*.pyc

View File

@ -18,12 +18,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -31,6 +31,8 @@ jobs:
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update

View File

@ -18,12 +18,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -31,6 +31,8 @@ jobs:
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update

View File

@ -15,8 +15,8 @@ jobs:
matrix:
arch: ["amd64"]
base:
- "nvcr.io/nvidia/pytorch:22.03-py3" # GPU enabled
- "ubuntu:20.04" # CPU only
- "nvidia/cuda:11.8.0-base-ubuntu22.04" # GPU enabled
- "python:3.10.8-slim" # CPU only
steps:
- uses: actions/checkout@v2
- name: Log in to the Container registry
@ -32,7 +32,7 @@ jobs:
base="ghcr.io/coqui-ai/tts"
tags="" # PR build
if [[ ${{ matrix.base }} = "ubuntu:20.04" ]]; then
if [[ ${{ matrix.base }} = "python:3.10.8-slim" ]]; then
base="ghcr.io/coqui-ai/tts-cpu"
fi

View File

@ -18,12 +18,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -31,10 +31,14 @@ jobs:
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: |
export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends git make gcc
sudo apt-get install espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel

View File

@ -10,7 +10,7 @@ jobs:
build-sdist:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Verify tag matches version
run: |
set -ex
@ -21,7 +21,7 @@ jobs:
fi
- uses: actions/setup-python@v2
with:
python-version: 3.8
python-version: 3.9
- run: |
python -m pip install -U pip setuptools wheel build
- run: |
@ -36,9 +36,9 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10"]
python-version: ["3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
@ -64,14 +64,6 @@ jobs:
with:
name: "sdist"
path: "dist/"
- uses: actions/download-artifact@v2
with:
name: "wheel-3.7"
path: "dist/"
- uses: actions/download-artifact@v2
with:
name: "wheel-3.8"
path: "dist/"
- uses: actions/download-artifact@v2
with:
name: "wheel-3.9"
@ -80,6 +72,10 @@ jobs:
with:
name: "wheel-3.10"
path: "dist/"
- uses: actions/download-artifact@v2
with:
name: "wheel-3.11"
path: "dist/"
- run: |
ls -lh dist/
- name: Setup PyPI config
@ -91,7 +87,7 @@ jobs:
EOF
- uses: actions/setup-python@v2
with:
python-version: 3.8
python-version: 3.9
- run: |
python -m pip install twine
- run: |

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.9]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -42,6 +42,5 @@ jobs:
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Lint check
run: |
make lint
- name: Style check
run: make style

View File

@ -18,12 +18,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -31,6 +31,8 @@ jobs:
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update

View File

@ -18,12 +18,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -31,6 +31,8 @@ jobs:
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update

53
.github/workflows/tts_tests2.yml vendored Normal file
View File

@ -0,0 +1,53 @@
name: tts-tests2
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends git make gcc
sudo apt-get install espeak
sudo apt-get install espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: make test_tts2

View File

@ -18,12 +18,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -31,6 +31,8 @@ jobs:
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update

53
.github/workflows/xtts_tests.yml vendored Normal file
View File

@ -0,0 +1,53 @@
name: xtts-tests
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends git make gcc
sudo apt-get install espeak
sudo apt-get install espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: make test_xtts

54
.github/workflows/zoo_tests0.yml vendored Normal file
View File

@ -0,0 +1,54 @@
name: zoo-tests-0
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y git make gcc
sudo apt-get install espeak espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: |
nose2 -F -v -B TTS tests.zoo_tests.test_models.test_models_offset_0_step_3
nose2 -F -v -B TTS tests.zoo_tests.test_models.test_voice_conversion

53
.github/workflows/zoo_tests1.yml vendored Normal file
View File

@ -0,0 +1,53 @@
name: zoo-tests-1
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y git make gcc
sudo apt-get install espeak espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\/hf\/bark\//https:\/\/huggingface.co\/erogol\/bark\/resolve\/main\//g' TTS/.models.json
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_1_step_3

View File

@ -1,4 +1,4 @@
name: zoo-tests
name: zoo-tests-2
on:
push:
@ -18,12 +18,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -31,6 +31,8 @@ jobs:
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update
@ -47,4 +49,4 @@ jobs:
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: make test_zoo
run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_2_step_3

4
.gitignore vendored
View File

@ -137,7 +137,7 @@ VCTK-Corpus-removed-silence/*
# ignore training logs
trainer_*_log.txt
# files used internally fro dev, test etc.
# files used internally for dev, test etc.
tests/outputs/*
tests/train_outputs/*
TODO.txt
@ -168,3 +168,5 @@ internal/*
wandb
depot/*
coqui_recipes/*
local_scripts/*
coqui_demos/*

View File

@ -6,7 +6,7 @@ repos:
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: 'https://github.com/psf/black'
rev: 20.8b1
rev: 22.3.0
hooks:
- id: black
language_version: python3

View File

@ -169,7 +169,9 @@ disable=missing-docstring,
comprehension-escape,
duplicate-code,
not-callable,
import-outside-toplevel
import-outside-toplevel,
logging-fstring-interpolation,
logging-not-lazy
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option

View File

@ -5,14 +5,19 @@
# Required
version: 2
# Set the version of Python and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.11"
# Optionally set the version of Python and requirements required to build your docs
python:
install:
- requirements: docs/requirements.txt
- requirements: requirements.txt
# Build documentation in the docs/ directory with Sphinx
sphinx:
builder: html
configuration: docs/source/conf.py
# Optionally set the version of Python and requirements required to build your docs
python:
version: 3.7
install:
- requirements: docs/requirements.txt
- requirements: requirements.txt

View File

@ -36,7 +36,7 @@ This model can be shared in two ways:
Models are served under `.models.json` file and any model is available under TTS CLI or Server end points.
Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/issues/380).
Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/discussions/930).
## Sending a ✨**PR**✨
@ -48,7 +48,7 @@ The following steps are tested on an Ubuntu system.
1. Fork 🐸TTS[https://github.com/coqui-ai/TTS] by clicking the fork button at the top right corner of the project page.
2. Clone 🐸TTS and add the main repo as a new remote named ```upsteam```.
2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
```bash
$ git clone git@github.com:<your Github name>/TTS.git
@ -128,6 +128,32 @@ The following steps are tested on an Ubuntu system.
14. Once things look perfect, We merge it to the ```dev``` branch and make it ready for the next version.
## Development in Docker container
If you prefer working within a Docker container as your development environment, you can do the following:
1. Fork 🐸TTS[https://github.com/coqui-ai/TTS] by clicking the fork button at the top right corner of the project page.
2. Clone 🐸TTS and add the main repo as a new remote named ```upsteam```.
```bash
$ git clone git@github.com:<your Github name>/TTS.git
$ cd TTS
$ git remote add upstream https://github.com/coqui-ai/TTS.git
```
3. Build the Docker Image as your development environment (it installs all of the dependencies for you):
```
docker build --tag=tts-dev:latest -f .\dockerfiles\Dockerfile.dev .
```
4. Run the container with GPU support:
```
docker run -it --gpus all tts-dev:latest /bin/bash
```
Feel free to ping us at any step you need help using our communication channels.
If you are new to Github or open-source contribution, These are good resources.

View File

@ -1,20 +1,19 @@
ARG BASE=nvcr.io/nvidia/pytorch:22.03-py3
ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
FROM ${BASE}
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
RUN pip install llvmlite --ignore-installed
# Create and activate virtual env
ENV VIRTUAL_ENV=/venv
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN pip install -U pip setuptools wheel
RUN apt-get update && apt-get upgrade -y
RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
RUN pip3 install llvmlite --ignore-installed
# Install Dependencies:
RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
RUN rm -rf /root/.cache/pip
# Copy TTS repository contents:
WORKDIR /root
COPY requirements.txt /root
COPY requirements.dev.txt /root
COPY requirements.notebooks.txt /root
RUN ["/bin/bash", "-c", "pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)"]
COPY . /root
RUN make install
ENTRYPOINT ["tts"]
CMD ["--help"]

View File

@ -19,6 +19,12 @@ test_vocoder: ## run vocoder tests.
test_tts: ## run tts tests.
nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests
test_tts2: ## run tts tests.
nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests2
test_xtts:
nose2 -F -v -B --with-coverage --coverage TTS tests.xtts_tests
test_aux: ## run aux tests.
nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests
./run_bash_tests.sh

324
README.md
View File

@ -1,9 +1,29 @@
# <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
🐸TTS is a library for advanced Text-to-Speech generation. It's built on the latest research, was designed to achieve the best trade-off among ease-of-training, speed and quality.
🐸TTS comes with pretrained models, tools for measuring dataset quality and already used in **20+ languages** for products and research projects.
## 🐸Coqui.ai News
- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
- 📣 ⓍTTS can now stream with <200ms latency.
- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html)
[![Gitter](https://badges.gitter.im/coqui-ai/TTS.svg)](https://gitter.im/coqui-ai/TTS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
<div align="center">
<img src="https://static.scarf.sh/a.png?x-pxid=cf317fe7-2188-4721-bc01-124bb5d5dbb2" />
## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
**🐸TTS is a library for advanced Text-to-Speech generation.**
🚀 Pretrained models in +1100 languages.
🛠️ Tools for training new models and fine-tuning existing models in any language.
📚 Utilities for dataset analysis and curation.
______________________________________________________________________
[![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv)
[![License](<https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg>)](https://opensource.org/licenses/MPL-2.0)
[![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS)
[![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
@ -18,16 +38,14 @@
![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/text_tests.yml/badge.svg)
![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/tts_tests.yml/badge.svg)
![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/vocoder_tests.yml/badge.svg)
![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests.yml/badge.svg)
![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests0.yml/badge.svg)
![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests1.yml/badge.svg)
![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests2.yml/badge.svg)
[![Docs](<https://readthedocs.org/projects/tts/badge/?version=latest&style=plastic>)](https://tts.readthedocs.io/en/latest/)
📰 [**Subscribe to 🐸Coqui.ai Newsletter**](https://coqui.ai/?subscription=true)
</div>
📢 [English Voice Samples](https://erogol.github.io/ddc-samples/) and [SoundCloud playlist](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2)
📄 [Text-to-Speech paper collection](https://github.com/erogol/TTS-papers)
<img src="https://static.scarf.sh/a.png?x-pxid=cf317fe7-2188-4721-bc01-124bb5d5dbb2" />
______________________________________________________________________
## 💬 Where to ask questions
Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly so that more people can benefit from it.
@ -36,12 +54,12 @@ Please use our dedicated channels for questions and discussion. Help is much mor
| ------------------------------- | --------------------------------------- |
| 🚨 **Bug Reports** | [GitHub Issue Tracker] |
| 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] |
| 👩‍💻 **Usage Questions** | [Github Discussions] |
| 🗯 **General Discussion** | [Github Discussions] or [Gitter Room] |
| 👩‍💻 **Usage Questions** | [GitHub Discussions] |
| 🗯 **General Discussion** | [GitHub Discussions] or [Discord] |
[github issue tracker]: https://github.com/coqui-ai/tts/issues
[github discussions]: https://github.com/coqui-ai/TTS/discussions
[gitter room]: https://gitter.im/coqui-ai/TTS?utm_source=share-link&utm_medium=link&utm_campaign=share-link
[discord]: https://discord.gg/5eXr5seRrv
[Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials
@ -49,16 +67,17 @@ Please use our dedicated channels for questions and discussion. Help is much mor
| Type | Links |
| ------------------------------- | --------------------------------------- |
| 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/)
| 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#install-tts)|
| 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#installation)|
| 👩‍💻 **Contributing** | [CONTRIBUTING.md](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md)|
| 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378)
| 🚀 **Released Models** | [TTS Releases](https://github.com/coqui-ai/TTS/releases) and [Experimental Models](https://github.com/coqui-ai/TTS/wiki/Experimental-Released-Models)|
| 📰 **Papers** | [TTS Papers](https://github.com/erogol/TTS-papers)|
## 🥇 TTS Performance
<p align="center"><img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/TTS-performance.png" width="800" /></p>
Underlined "TTS*" and "Judy*" are 🐸TTS models
<!-- [Details...](https://github.com/coqui-ai/TTS/wiki/Mean-Opinion-Score-Results) -->
Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not released open-source. They are here to show the potential. Models prefixed with a dot (.Jofish .Abe and .Janice) are real human voices.
## Features
- High-performance Deep Learning models for Text2Speech tasks.
@ -74,8 +93,8 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- Utilities to use and test your models.
- Modular (but not too much) code base enabling easy implementation of new ideas.
## Implemented Models
### Text-to-Spectrogram
## Model Implementations
### Spectrogram models
- Tacotron: [paper](https://arxiv.org/abs/1703.10135)
- Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
- Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
@ -83,9 +102,19 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
- FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf)
- FastSpeech: [paper](https://arxiv.org/abs/1905.09263)
- FastSpeech2: [paper](https://arxiv.org/abs/2006.04558)
- SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557)
- Capacitron: [paper](https://arxiv.org/abs/1906.03402)
- OverFlow: [paper](https://arxiv.org/abs/2211.06892)
- Neural HMM TTS: [paper](https://arxiv.org/abs/2108.13320)
- Delightful TTS: [paper](https://arxiv.org/abs/2110.12612)
### End-to-End Models
- ⓍTTS: [blog](https://coqui.ai/blog/tts/open_xtts)
- VITS: [paper](https://arxiv.org/pdf/2106.06103)
- 🐸 YourTTS: [paper](https://arxiv.org/abs/2112.02418)
- 🐢 Tortoise: [orig. repo](https://github.com/neonbjb/tortoise-tts)
- 🐶 Bark: [orig. repo](https://github.com/suno-ai/bark)
### Attention Methods
- Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
@ -109,10 +138,13 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- HiFiGAN: [paper](https://arxiv.org/abs/2010.05646)
- UnivNet: [paper](https://arxiv.org/abs/2106.07889)
### Voice Conversion
- FreeVC: [paper](https://arxiv.org/abs/2210.15418)
You can also help us implement more models.
## Install TTS
🐸TTS is tested on Ubuntu 18.04 with **python >= 3.7, < 3.11.**.
## Installation
🐸TTS is tested on Ubuntu 18.04 with **python >= 3.9, < 3.12.**.
If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.
@ -136,101 +168,225 @@ $ make install
If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system).
## Use TTS
### Single Speaker Models
## Docker Image
You can also try TTS without install with the docker image.
Simply run the following command and you will be able to run TTS without installing it.
```bash
docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
python3 TTS/server/server.py --list_models #To get the list of available models
python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server
```
You can then enjoy the TTS server [here](http://[::1]:5002/)
More details about the docker images (like GPU support) can be found [here](https://tts.readthedocs.io/en/latest/docker_images.html)
## Synthesizing speech by 🐸TTS
### 🐍 Python API
#### Running a multi-speaker and multi-lingual model
```python
import torch
from TTS.api import TTS
# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"
# List available 🐸TTS models
print(TTS().list_models())
# Init TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
# Run TTS
# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language
# Text to speech list of amplitude values as output
wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en")
# Text to speech to a file
tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
```
#### Running a single speaker model
```python
# Init TTS with the target model name
tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False).to(device)
# Run TTS
tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)
# Example voice cloning with YourTTS in English, French and Portuguese
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device)
tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav")
```
#### Example voice conversion
Converting the voice in `source_wav` to the voice of `target_wav`
```python
tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to("cuda")
tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
```
#### Example voice cloning together with the voice conversion model.
This way, you can clone voices by using any model in 🐸TTS.
```python
tts = TTS("tts_models/de/thorsten/tacotron2-DDC")
tts.tts_with_vc_to_file(
"Wie sage ich auf Italienisch, dass ich dich liebe?",
speaker_wav="target/speaker.wav",
file_path="output.wav"
)
```
#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
For Fairseq models, use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html)
and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
```python
# TTS with on the fly voice conversion
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
"Wie sage ich auf Italienisch, dass ich dich liebe?",
speaker_wav="target/speaker.wav",
file_path="output.wav"
)
```
### Command-line `tts`
<!-- begin-tts-readme -->
Synthesize speech on command line.
You can either use your trained model or choose a model from the provided list.
If you don't specify any models, then it uses LJSpeech based English model.
#### Single Speaker Models
- List provided models:
```
$ tts --list_models
```
```
$ tts --list_models
```
- Get model info (for both tts_models and vocoder_models):
- Query by type/name:
The model_info_by_name uses the name as it from the --list_models.
```
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
```
For example:
```
$ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
```
```
$ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
```
- Query by type/idx:
The model_query_idx uses the corresponding idx from --list_models.
```
$ tts --model_info_by_idx "<model_type>/<model_query_idx>"
```
For example:
```
$ tts --model_info_by_idx tts_models/3
```
- Run TTS with default models:
- Query by type/name:
The model_info_by_name uses the name as it from the --list_models.
```
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
```
For example:
```
$ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
$ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
```
- Query by type/idx:
The model_query_idx uses the corresponding idx from --list_models.
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav
$ tts --model_info_by_idx "<model_type>/<model_query_idx>"
```
For example:
```
$ tts --model_info_by_idx tts_models/3
```
- Query info for model info by full name:
```
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
```
- Run TTS with default models:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav
```
- Run TTS and pipe out the generated TTS wav file data:
```
$ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
```
- Run a TTS model with its default vocoder model:
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
```
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
```
For example:
```
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
```
```
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
```
- Run with specific TTS and vocoder models from the list:
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
```
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
```
For example:
```
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
```
```
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
```
- Run your own TTS model (Using Griffin-Lim Vocoder):
```
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```
```
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```
- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```
### Multi-speaker Models
```
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```
- List the available speakers and choose as <speaker_id> among them:
#### Multi-speaker Models
```
$ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
```
- List the available speakers and choose a <speaker_id> among them:
```
$ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
```
- Run the multi-speaker TTS model with the target speaker ID:
```
$ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
```
```
$ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
```
- Run your own multi-speaker TTS model:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
### Voice Conversion Models
```
$ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
```
<!-- end-tts-readme -->
## Directory Structure
```
@ -239,8 +395,6 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
|- TTS
|- bin/ (folder for all the executables.)
|- train*.py (train your target model.)
|- distribute.py (train your TTS model using Multiple GPUs.)
|- compute_statistics.py (compute dataset statistics for normalization.)
|- ...
|- tts/ (text to speech models)
|- layers/ (model layer definitions)

View File

@ -1,14 +1,115 @@
{
"tts_models": {
"multilingual":{
"multi-dataset":{
"your_tts":{
"multilingual": {
"multi-dataset": {
"xtts_v2": {
"description": "XTTS-v2.0.3 by Coqui with 17 languages.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/speakers_xtts.pth"
],
"model_hash": "10f92b55c512af7a8d39d650547a15a7",
"default_vocoder": null,
"commit": "480a6cdf7",
"license": "CPML",
"contact": "info@coqui.ai",
"tos_required": true
},
"xtts_v1.1": {
"description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/hash.md5"
],
"model_hash": "7c62beaf58d39b729de287330dc254e7b515677416839b649a50e7cf74c3df59",
"default_vocoder": null,
"commit": "82910a63",
"license": "CPML",
"contact": "info@coqui.ai",
"tos_required": true
},
"your_tts": {
"description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
"default_vocoder": null,
"commit": "e9a1953e",
"license": "CC BY-NC-ND 4.0",
"contact": "egolge@coqui.ai"
},
"bark": {
"description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
"https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
"https://coqui.gateway.scarf.sh/hf/text_2.pt",
"https://coqui.gateway.scarf.sh/hf/bark/config.json",
"https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
"https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
],
"default_vocoder": null,
"commit": "e9a1953e",
"license": "MIT",
"contact": "https://www.suno.ai/"
}
}
},
"bg": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"cs": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"da": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"et": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"ga": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
@ -79,6 +180,14 @@
"license": "apache 2.0",
"contact": "egolge@coqui.com"
},
"vits--neon": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
"default_vocoder": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause",
"contact": null,
"commit": null
},
"fast_pitch": {
"description": "FastPitch model trained on LJSpeech using the Aligner Network",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
@ -87,6 +196,24 @@
"author": "Eren Gölge @erogol",
"license": "apache 2.0",
"contact": "egolge@coqui.com"
},
"overflow": {
"description": "Overflow model trained on LJSpeech",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
"commit": "3b1a28f",
"author": "Eren Gölge @erogol",
"license": "apache 2.0",
"contact": "egolge@coqui.ai"
},
"neural_hmm": {
"description": "Neural HMM model trained on LJSpeech",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.11.0_models/tts_models--en--ljspeech--neural_hmm.zip",
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
"commit": "3b1a28f",
"author": "Shivam Metha @shivammehta25",
"license": "apache 2.0",
"contact": "d83ee8fe45e3c0d776d4a865aca21d7c2ac324c4"
}
},
"vctk": {
@ -99,7 +226,7 @@
"license": "apache 2.0",
"contact": "egolge@coqui.ai"
},
"fast_pitch":{
"fast_pitch": {
"description": "FastPitch model trained on VCTK dataseset.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
"default_vocoder": null,
@ -130,15 +257,45 @@
"license": "apache 2.0",
"contact": "adamfroghyar@gmail.com"
},
"capacitron-t2-c150": {
"capacitron-t2-c150_v2": {
"description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c150.zip",
"commit": "d6284e7",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
"commit": "a67039d",
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
"author": "Adam Froghyar @a-froghyar",
"license": "apache 2.0",
"contact": "adamfroghyar@gmail.com"
}
},
"multi-dataset": {
"tortoise-v2": {
"description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
"github_rls_url": [
"https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
],
"commit": "c1875f6",
"default_vocoder": null,
"author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
"license": "apache 2.0"
}
},
"jenny": {
"jenny": {
"description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
"default_vocoder": null,
"commit": "ba40a1c",
"license": "custom - see https://github.com/dioco-group/jenny-tts-dataset#important",
"author": "@noml4u"
}
}
},
"es": {
@ -151,6 +308,15 @@
"license": "MPL",
"contact": "egolge@coqui.com"
}
},
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"fr": {
@ -158,22 +324,38 @@
"tacotron2-DDC": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
"commit": "",
"commit": null,
"author": "Eren Gölge @erogol",
"license": "MPL",
"contact": "egolge@coqui.com"
}
},
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"uk":{
"uk": {
"mai": {
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
"author":"@robinhad",
"author": "@robinhad",
"commit": "bdab788d",
"license": "MIT",
"contact": "",
"default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
},
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
@ -198,6 +380,15 @@
"stats_file": null,
"commit": "540d811"
}
},
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"de": {
@ -215,6 +406,23 @@
"author": "@thorstenMueller",
"license": "apache 2.0",
"commit": "unknown"
},
"tacotron2-DDC": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
"description": "Thorsten-Dec2021-22k-DDC",
"author": "@thorstenMueller",
"license": "apache 2.0",
"commit": "unknown"
}
},
"css10": {
"vits-neon": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
"default_vocoder": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause",
"commit": null
}
}
},
@ -230,9 +438,9 @@
}
}
},
"tr":{
"tr": {
"common-voice": {
"glow-tts":{
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
"default_vocoder": "vocoder_models/tr/common-voice/hifigan",
"license": "MIT",
@ -244,7 +452,7 @@
},
"it": {
"mai_female": {
"glow-tts":{
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -252,7 +460,7 @@
"license": "apache 2.0",
"commit": null
},
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -262,7 +470,7 @@
}
},
"mai_male": {
"glow-tts":{
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -270,7 +478,7 @@
"license": "apache 2.0",
"commit": null
},
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -282,7 +490,7 @@
},
"ewe": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -294,7 +502,7 @@
},
"hau": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -306,7 +514,7 @@
},
"lin": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -318,7 +526,7 @@
},
"tw_akuapem": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -330,7 +538,7 @@
},
"tw_asante": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -342,7 +550,7 @@
},
"yor": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -351,6 +559,205 @@
"commit": "1b22f03"
}
}
},
"hu": {
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"el": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"fi": {
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"hr": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"lt": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"lv": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"mt": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"pl": {
"mai_female": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"pt": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"ro": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sk": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sl": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sv": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"ca": {
"custom": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
"default_vocoder": null,
"commit": null,
"description": " It is trained from zero with 101460 utterances consisting of 257 speakers, approx 138 hours of speech. We used three datasets;\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
"author": "@gullabi",
"license": "CC-BY-4.0"
}
}
},
"fa": {
"custom": {
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
"default_vocoder": null,
"commit": null,
"description": "persian-tts-female-glow_tts model for text to speech purposes. Single-speaker female voice Trained on persian-tts-dataset-famale. \nThis model has no compatible vocoder thus the output quality is not very good. \nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
"author": "@karim23657",
"license": "CC-BY-4.0"
}
}
},
"bn": {
"custom": {
"vits-male": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
"default_vocoder": null,
"commit": null,
"description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
"author": "@mobassir94",
"license": "Apache 2.0"
},
"vits-female": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
"default_vocoder": null,
"commit": null,
"description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
"author": "@mobassir94",
"license": "Apache 2.0"
}
}
},
"be": {
"common-voice": {
"glow-tts":{
"description": "Belarusian GlowTTS model created by @alex73 (Github).",
"github_rls_url":"https://coqui.gateway.scarf.sh/v0.16.6/tts_models--be--common-voice--glow-tts.zip",
"default_vocoder": "vocoder_models/be/common-voice/hifigan",
"commit": "c0aabb85",
"license": "CC-BY-SA 4.0",
"contact": "alex73mail@gmail.com"
}
}
}
},
"vocoder_models": {
@ -460,6 +867,13 @@
"author": "@thorstenMueller",
"license": "apache 2.0",
"commit": "unknown"
},
"hifigan_v1": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
"description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
"author": "@thorstenMueller",
"license": "apache 2.0",
"commit": "unknown"
}
}
},
@ -478,16 +892,16 @@
"mai": {
"multiband-melgan": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
"author":"@robinhad",
"author": "@robinhad",
"commit": "bdab788d",
"license": "MIT",
"contact": ""
}
}
},
"tr":{
"tr": {
"common-voice": {
"hifigan":{
"hifigan": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
"description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
"author": "Fatih Akademi",
@ -495,6 +909,30 @@
"commit": null
}
}
},
"be": {
"common-voice": {
"hifigan": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.16.6/vocoder_models--be--common-voice--hifigan.zip",
"description": "Belarusian HiFiGAN model created by @alex73 (Github).",
"author": "@alex73",
"license": "CC-BY-SA 4.0",
"commit": "c0aabb85"
}
}
}
},
"voice_conversion_models": {
"multilingual": {
"vctk": {
"freevc24": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
"description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
"author": "Jing-Yi Li @OlaWod",
"license": "MIT",
"commit": null
}
}
}
}
}
}
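Each leaf of this registry maps to a model name of the form `<model_type>/<language>/<dataset>/<model>` that both the `tts` CLI and the Python API resolve through `ModelManager`. A minimal sketch using the `xtts_v2` entry above (the reference clip path is a placeholder, and this CPML-licensed model requires accepting its terms on first download):
```
from TTS.api import TTS

# "tts_models" / "multilingual" / "multi-dataset" / "xtts_v2" from the registry above.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False)
tts.tts_to_file(
    text="Hello from XTTS.",
    speaker_wav="reference.wav",  # placeholder: short reference clip for voice cloning
    language="en",
    file_path="xtts_output.wav",
)
```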


@ -1 +1 @@
0.7.1
0.22.0

TTS/api.py (new file, 458 lines)

@ -0,0 +1,458 @@
import tempfile
import warnings
from pathlib import Path
from typing import Union
import numpy as np
from torch import nn
from TTS.utils.audio.numpy_transforms import save_wav
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
from TTS.config import load_config
class TTS(nn.Module):
"""TODO: Add voice conversion and Capacitron support."""
def __init__(
self,
model_name: str = "",
model_path: str = None,
config_path: str = None,
vocoder_path: str = None,
vocoder_config_path: str = None,
progress_bar: bool = True,
gpu=False,
):
"""🐸TTS python interface that allows to load and use the released models.
Example with a multi-speaker model:
>>> from TTS.api import TTS
>>> tts = TTS(TTS.list_models()[0])
>>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
>>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
Example with a single-speaker model:
>>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
>>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
Example loading a model from a path:
>>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False)
>>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
Example voice cloning with YourTTS in English, French and Portuguese:
>>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
>>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav")
>>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
>>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")
Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html):
>>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True)
>>> tts.tts_to_file("This is a test.", file_path="output.wav")
Args:
model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
model_path (str, optional): Path to the model checkpoint. Defaults to None.
config_path (str, optional): Path to the model config. Defaults to None.
vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
super().__init__()
self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
self.config = load_config(config_path) if config_path else None
self.synthesizer = None
self.voice_converter = None
self.model_name = ""
if gpu:
warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")
if model_name is not None and len(model_name) > 0:
if "tts_models" in model_name:
self.load_tts_model_by_name(model_name, gpu)
elif "voice_conversion_models" in model_name:
self.load_vc_model_by_name(model_name, gpu)
else:
self.load_model_by_name(model_name, gpu)
if model_path:
self.load_tts_model_by_path(
model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
)
@property
def models(self):
return self.manager.list_tts_models()
@property
def is_multi_speaker(self):
if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
return False
@property
def is_multi_lingual(self):
# Not sure what sets this to None, but applied a fix to prevent crashing.
if (
isinstance(self.model_name, str)
and "xtts" in self.model_name
or self.config
and ("xtts" in self.config.model or len(self.config.languages) > 1)
):
return True
if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
return self.synthesizer.tts_model.language_manager.num_languages > 1
return False
@property
def speakers(self):
if not self.is_multi_speaker:
return None
return self.synthesizer.tts_model.speaker_manager.speaker_names
@property
def languages(self):
if not self.is_multi_lingual:
return None
return self.synthesizer.tts_model.language_manager.language_names
@staticmethod
def get_models_file_path():
return Path(__file__).parent / ".models.json"
def list_models(self):
return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
def download_model_by_name(self, model_name: str):
model_path, config_path, model_item = self.manager.download_model(model_name)
if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
# return model directory if there are multiple files
# we assume that the model knows how to load itself
return None, None, None, None, model_path
if model_item.get("default_vocoder") is None:
return model_path, config_path, None, None, None
vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
return model_path, config_path, vocoder_path, vocoder_config_path, None
def load_model_by_name(self, model_name: str, gpu: bool = False):
"""Load one of the 🐸TTS models by name.
Args:
model_name (str): Model name to load. You can list models by ```tts.models```.
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
self.load_tts_model_by_name(model_name, gpu)
def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
"""Load one of the voice conversion models by name.
Args:
model_name (str): Model name to load. You can list models by ```tts.models```.
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
self.model_name = model_name
model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)
def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
"""Load one of 🐸TTS models by name.
Args:
model_name (str): Model name to load. You can list models by ```tts.models```.
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
TODO: Add tests
"""
self.synthesizer = None
self.model_name = model_name
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
model_name
)
# init synthesizer
# None values are fetched from the model
self.synthesizer = Synthesizer(
tts_checkpoint=model_path,
tts_config_path=config_path,
tts_speakers_file=None,
tts_languages_file=None,
vocoder_checkpoint=vocoder_path,
vocoder_config=vocoder_config_path,
encoder_checkpoint=None,
encoder_config=None,
model_dir=model_dir,
use_cuda=gpu,
)
def load_tts_model_by_path(
self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
):
"""Load a model from a path.
Args:
model_path (str): Path to the model checkpoint.
config_path (str): Path to the model config.
vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
vocoder_config (str, optional): Path to the vocoder config. Defaults to None.
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
self.synthesizer = Synthesizer(
tts_checkpoint=model_path,
tts_config_path=config_path,
tts_speakers_file=None,
tts_languages_file=None,
vocoder_checkpoint=vocoder_path,
vocoder_config=vocoder_config,
encoder_checkpoint=None,
encoder_config=None,
use_cuda=gpu,
)
def _check_arguments(
self,
speaker: str = None,
language: str = None,
speaker_wav: str = None,
emotion: str = None,
speed: float = None,
**kwargs,
) -> None:
"""Check if the arguments are valid for the model."""
# check for the coqui tts models
if self.is_multi_speaker and (speaker is None and speaker_wav is None):
raise ValueError("Model is multi-speaker but no `speaker` is provided.")
if self.is_multi_lingual and language is None:
raise ValueError("Model is multi-lingual but no `language` is provided.")
if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
raise ValueError("Model is not multi-speaker but `speaker` is provided.")
if not self.is_multi_lingual and language is not None:
raise ValueError("Model is not multi-lingual but `language` is provided.")
if not emotion is None and not speed is None:
raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.")
def tts(
self,
text: str,
speaker: str = None,
language: str = None,
speaker_wav: str = None,
emotion: str = None,
speed: float = None,
split_sentences: bool = True,
**kwargs,
):
"""Convert text to speech.
Args:
text (str):
Input text to synthesize.
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
supported by `XTTS` model.
speaker_wav (str, optional):
Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
Defaults to None.
emotion (str, optional):
Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
speed (float, optional):
Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
Defaults to None.
split_sentences (bool, optional):
Split text into sentences, synthesize them separately and concatenate the file audio.
Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
kwargs (dict, optional):
Additional arguments for the model.
"""
self._check_arguments(
speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
)
wav = self.synthesizer.tts(
text=text,
speaker_name=speaker,
language_name=language,
speaker_wav=speaker_wav,
reference_wav=None,
style_wav=None,
style_text=None,
reference_speaker_name=None,
split_sentences=split_sentences,
**kwargs,
)
return wav
def tts_to_file(
self,
text: str,
speaker: str = None,
language: str = None,
speaker_wav: str = None,
emotion: str = None,
speed: float = 1.0,
pipe_out=None,
file_path: str = "output.wav",
split_sentences: bool = True,
**kwargs,
):
"""Convert text to speech.
Args:
text (str):
Input text to synthesize.
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
language (str, optional):
Language code for multi-lingual models. You can check whether loaded model is multi-lingual
`tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
speaker_wav (str, optional):
Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
Defaults to None.
emotion (str, optional):
Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
speed (float, optional):
Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
pipe_out (BytesIO, optional):
Flag to stdout the generated TTS wav file for shell pipe.
file_path (str, optional):
Output file path. Defaults to "output.wav".
split_sentences (bool, optional):
Split text into sentences, synthesize them separately and concatenate the file audio.
Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
kwargs (dict, optional):
Additional arguments for the model.
"""
self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
wav = self.tts(
text=text,
speaker=speaker,
language=language,
speaker_wav=speaker_wav,
split_sentences=split_sentences,
**kwargs,
)
self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
return file_path
def voice_conversion(
self,
source_wav: str,
target_wav: str,
):
"""Voice conversion with FreeVC. Convert source wav to target speaker.
Args:
source_wav (str):
Path to the source wav file.
target_wav (str):
Path to the target wav file.
"""
wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)
return wav
def voice_conversion_to_file(
self,
source_wav: str,
target_wav: str,
file_path: str = "output.wav",
):
"""Voice conversion with FreeVC. Convert source wav to target speaker.
Args:
source_wav (str):
Path to the source wav file.
target_wav (str):
Path to the target wav file.
file_path (str, optional):
Output file path. Defaults to "output.wav".
"""
wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav)
save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
return file_path
def tts_with_vc(
self,
text: str,
language: str = None,
speaker_wav: str = None,
speaker: str = None,
split_sentences: bool = True,
):
"""Convert text to speech with voice conversion.
It combines tts with voice conversion to fake voice cloning.
- Convert text to speech with tts.
- Convert the output wav to target speaker with voice conversion.
Args:
text (str):
Input text to synthesize.
language (str, optional):
Language code for multi-lingual models. You can check whether loaded model is multi-lingual
`tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
speaker_wav (str, optional):
Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
Defaults to None.
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
split_sentences (bool, optional):
Split text into sentences, synthesize them separately and concatenate the file audio.
Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
"""
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
# Lazy code... save it to a temp file to resample it while reading it for VC
self.tts_to_file(
text=text, speaker=speaker, language=language, file_path=fp.name, split_sentences=split_sentences
)
if self.voice_converter is None:
self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
return wav
def tts_with_vc_to_file(
self,
text: str,
language: str = None,
speaker_wav: str = None,
file_path: str = "output.wav",
speaker: str = None,
split_sentences: bool = True,
):
"""Convert text to speech with voice conversion and save to file.
Check `tts_with_vc` for more details.
Args:
text (str):
Input text to synthesize.
language (str, optional):
Language code for multi-lingual models. You can check whether loaded model is multi-lingual
`tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
speaker_wav (str, optional):
Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
Defaults to None.
file_path (str, optional):
Output file path. Defaults to "output.wav".
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
split_sentences (bool, optional):
Split text into sentences, synthesize them separately and concatenate the file audio.
Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
"""
wav = self.tts_with_vc(
text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences
)
save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
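As a usage note for the `tts_with_vc*` helpers above, a minimal sketch (the target voice path is a placeholder; the FreeVC converter is loaded on demand, as in `tts_with_vc`):
```
from TTS.api import TTS

# Single-speaker German model from the registry; any regular TTS model works here.
tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False)
tts.tts_with_vc_to_file(
    text="Ich bin eine Testnachricht.",
    speaker_wav="target_voice.wav",  # placeholder: reference clip of the target voice
    file_path="output.wav",
)
```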


@ -6,79 +6,192 @@ import torch
from tqdm import tqdm
from TTS.config import load_config
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.managers import save_file
from TTS.tts.utils.speakers import SpeakerManager
parser = argparse.ArgumentParser(
description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
"""
Example runs:
python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json
""",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
parser.add_argument("config_path", type=str, help="Path to model config file.")
parser.add_argument("config_dataset_path", type=str, help="Path to dataset config file.")
parser.add_argument("--output_path", type=str, help="Path for output `pth` or `json` file.", default="speakers.pth")
parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)
args = parser.parse_args()
def compute_embeddings(
model_path,
config_path,
output_path,
old_speakers_file=None,
old_append=False,
config_dataset_path=None,
formatter_name=None,
dataset_name=None,
dataset_path=None,
meta_file_train=None,
meta_file_val=None,
disable_cuda=False,
no_eval=False,
):
use_cuda = torch.cuda.is_available() and not disable_cuda
use_cuda = torch.cuda.is_available() and not args.disable_cuda
c_dataset = load_config(args.config_dataset_path)
meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
if meta_data_eval is None:
wav_files = meta_data_train
else:
wav_files = meta_data_train + meta_data_eval
encoder_manager = SpeakerManager(
encoder_model_path=args.model_path,
encoder_config_path=args.config_path,
d_vectors_file_path=args.old_file,
use_cuda=use_cuda,
)
class_name_key = encoder_manager.encoder_config.class_name_key
# compute speaker embeddings
speaker_mapping = {}
for idx, wav_file in enumerate(tqdm(wav_files)):
if isinstance(wav_file, dict):
class_name = wav_file[class_name_key]
wav_file = wav_file["audio_file"]
if config_dataset_path is not None:
c_dataset = load_config(config_dataset_path)
meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not no_eval)
else:
class_name = None
c_dataset = BaseDatasetConfig()
c_dataset.formatter = formatter_name
c_dataset.dataset_name = dataset_name
c_dataset.path = dataset_path
if meta_file_train is not None:
c_dataset.meta_file_train = meta_file_train
if meta_file_val is not None:
c_dataset.meta_file_val = meta_file_val
meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval)
wav_file_name = os.path.basename(wav_file)
if args.old_file is not None and wav_file_name in encoder_manager.clip_ids:
# get the embedding from the old file
embedd = encoder_manager.get_embedding_by_clip(wav_file_name)
if meta_data_eval is None:
samples = meta_data_train
else:
# extract the embedding
embedd = encoder_manager.compute_embedding_from_clip(wav_file)
samples = meta_data_train + meta_data_eval
# create speaker_mapping if target dataset is defined
speaker_mapping[wav_file_name] = {}
speaker_mapping[wav_file_name]["name"] = class_name
speaker_mapping[wav_file_name]["embedding"] = embedd
encoder_manager = SpeakerManager(
encoder_model_path=model_path,
encoder_config_path=config_path,
d_vectors_file_path=old_speakers_file,
use_cuda=use_cuda,
)
if speaker_mapping:
# save speaker_mapping if target dataset is defined
if os.path.isdir(args.output_path):
mapping_file_path = os.path.join(args.output_path, "speakers.pth")
class_name_key = encoder_manager.encoder_config.class_name_key
# compute speaker embeddings
if old_speakers_file is not None and old_append:
speaker_mapping = encoder_manager.embeddings
else:
mapping_file_path = args.output_path
speaker_mapping = {}
if os.path.dirname(mapping_file_path) != "":
os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
for fields in tqdm(samples):
class_name = fields[class_name_key]
audio_file = fields["audio_file"]
embedding_key = fields["audio_unique_name"]
save_file(speaker_mapping, mapping_file_path)
print("Speaker embeddings saved at:", mapping_file_path)
# Only update the speaker name when the embedding is already in the old file.
if embedding_key in speaker_mapping:
speaker_mapping[embedding_key]["name"] = class_name
continue
if old_speakers_file is not None and embedding_key in encoder_manager.clip_ids:
# get the embedding from the old file
embedd = encoder_manager.get_embedding_by_clip(embedding_key)
else:
# extract the embedding
embedd = encoder_manager.compute_embedding_from_clip(audio_file)
# create speaker_mapping if target dataset is defined
speaker_mapping[embedding_key] = {}
speaker_mapping[embedding_key]["name"] = class_name
speaker_mapping[embedding_key]["embedding"] = embedd
if speaker_mapping:
# save speaker_mapping if target dataset is defined
if os.path.isdir(output_path):
mapping_file_path = os.path.join(output_path, "speakers.pth")
else:
mapping_file_path = output_path
if os.path.dirname(mapping_file_path) != "":
os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
save_file(speaker_mapping, mapping_file_path)
print("Speaker embeddings saved at:", mapping_file_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
"""
Example runs:
python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
""",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument(
"--model_path",
type=str,
help="Path to model checkpoint file. It defaults to the released speaker encoder.",
default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
)
parser.add_argument(
"--config_path",
type=str,
help="Path to model config file. It defaults to the released speaker encoder config.",
default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
)
parser.add_argument(
"--config_dataset_path",
type=str,
help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
default=None,
)
parser.add_argument(
"--output_path",
type=str,
help="Path for output `pth` or `json` file.",
default="speakers.pth",
)
parser.add_argument(
"--old_file",
type=str,
help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.",
default=None,
)
parser.add_argument(
"--old_append",
help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False",
default=False,
action="store_true",
)
parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
parser.add_argument(
"--formatter_name",
type=str,
help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--dataset_name",
type=str,
help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--dataset_path",
type=str,
help="Path to the dataset. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--meta_file_train",
type=str,
help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--meta_file_val",
type=str,
help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
default=None,
)
args = parser.parse_args()
compute_embeddings(
args.model_path,
args.config_path,
args.output_path,
old_speakers_file=args.old_file,
old_append=args.old_append,
config_dataset_path=args.config_dataset_path,
formatter_name=args.formatter_name,
dataset_name=args.dataset_name,
dataset_path=args.dataset_path,
meta_file_train=args.meta_file_train,
meta_file_val=args.meta_file_val,
disable_cuda=args.disable_cuda,
no_eval=args.no_eval,
)
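Because the embedding logic now lives in a reusable `compute_embeddings()` function instead of module-level script code, it can also be called from Python; a sketch with placeholder paths (the CLI defaults point to the released speaker encoder checkpoint and config):
```
from TTS.bin.compute_embeddings import compute_embeddings

# Placeholder paths for the encoder checkpoint/config and the dataset config.
compute_embeddings(
    model_path="speaker_encoder_model.pth",
    config_path="speaker_encoder_config.json",
    output_path="speakers.pth",
    config_dataset_path="dataset_config.json",
)
```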


@ -10,7 +10,6 @@ from TTS.tts.utils.speakers import SpeakerManager
def compute_encoder_accuracy(dataset_items, encoder_manager):
class_name_key = encoder_manager.encoder_config.class_name_key
map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)


@ -15,6 +15,7 @@ from TTS.tts.models import setup_model
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.audio.numpy_transforms import quantize
from TTS.utils.generic_utils import count_parameters
use_cuda = torch.cuda.is_available()
@ -37,7 +38,7 @@ def setup_loader(ap, r, verbose=False):
precompute_num_workers=0,
use_noise_augment=False,
verbose=verbose,
speaker_id_mapping=speaker_manager.ids if c.use_speaker_embedding else None,
speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
)
@ -159,12 +160,11 @@ def inference(
def extract_spectrograms(
data_loader, model, ap, output_path, quantized_wav=False, save_audio=False, debug=False, metada_name="metada.txt"
data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
):
model.eval()
export_metadata = []
for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
# format data
(
text_input,
@ -197,8 +197,8 @@ def extract_spectrograms(
_, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)
# quantize and save wav
if quantized_wav:
wavq = ap.quantize(wav)
if quantize_bits > 0:
wavq = quantize(wav, quantize_bits)
np.save(wavq_path, wavq)
# save TTS mel
@ -264,7 +264,7 @@ def main(args): # pylint: disable=redefined-outer-name
model,
ap,
args.output_path,
quantized_wav=args.quantized,
quantize_bits=args.quantize_bits,
save_audio=args.save_audio,
debug=args.debug,
metada_name="metada.txt",
@ -278,7 +278,7 @@ if __name__ == "__main__":
parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
parser.add_argument("--quantized", action="store_true", help="Save quantized audio files")
parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
args = parser.parse_args()


@ -7,30 +7,25 @@ from tqdm.contrib.concurrent import process_map
from TTS.config import load_config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
phonemizer = Gruut(language="en-us")
from TTS.tts.utils.text.phonemizers import Gruut
def compute_phonemes(item):
try:
text = item[0]
ph = phonemizer.phonemize(text).split("|")
except:
return []
return list(set(ph))
text = item["text"]
ph = phonemizer.phonemize(text).replace("|", "")
return set(list(ph))
def main():
# pylint: disable=W0601
global c
global c, phonemizer
# pylint: disable=bad-option-value
parser = argparse.ArgumentParser(
description="""Find all the unique characters or phonemes in a dataset.\n\n"""
"""
Example runs:
python TTS/bin/find_unique_chars.py --config_path config.json
python TTS/bin/find_unique_phonemes.py --config_path config.json
""",
formatter_class=RawTextHelpFormatter,
)
@ -46,15 +41,24 @@ def main():
items = train_items + eval_items
print("Num items:", len(items))
is_lang_def = all(item["language"] for item in items)
language_list = [item["language"] for item in items]
is_lang_def = all(language_list)
if not c.phoneme_language or not is_lang_def:
raise ValueError("Phoneme language must be defined in config.")
if not language_list.count(language_list[0]) == len(language_list):
raise ValueError(
"Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
)
phonemizer = Gruut(language=language_list[0], keep_puncs=True)
phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
phones = []
for ph in phonemes:
phones.extend(ph)
phones = set(phones)
lower_phones = filter(lambda c: c.islower(), phones)
phones_force_lower = [c.lower() for c in phones]


@ -1,50 +1,75 @@
import argparse
import glob
import multiprocessing
import os
import pathlib
import torch
from tqdm import tqdm
from TTS.utils.vad import get_vad_model_and_utils, remove_silence
torch.set_num_threads(1)
def adjust_path_and_remove_silence(audio_path):
output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
# ignore if the file exists
if os.path.exists(output_path) and not args.force:
return output_path
return output_path, False
# create all directory structure
pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
# remove the silence and save the audio
output_path = remove_silence(
output_path, is_speech = remove_silence(
model_and_utils,
audio_path,
output_path,
trim_just_beginning_and_end=args.trim_just_beginning_and_end,
use_cuda=args.use_cuda,
)
return output_path
return output_path, is_speech
def preprocess_audios():
files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
print("> Number of files: ", len(files))
if not args.force:
print("> Ignoring files that already exist in the output directory.")
print("> Ignoring files that already exist in the output idrectory.")
if args.trim_just_beginning_and_end:
print("> Trimming just the beginning and the end with nonspeech parts.")
else:
print("> Trimming all nonspeech parts.")
filtered_files = []
if files:
# create threads
# num_threads = multiprocessing.cpu_count()
# process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
for f in tqdm(files):
adjust_path_and_remove_silence(f)
if args.num_processes > 1:
with multiprocessing.Pool(processes=args.num_processes) as pool:
results = list(
tqdm(
pool.imap_unordered(adjust_path_and_remove_silence, files),
total=len(files),
desc="Processing audio files",
)
)
for output_path, is_speech in results:
if not is_speech:
filtered_files.append(output_path)
else:
for f in tqdm(files):
output_path, is_speech = adjust_path_and_remove_silence(f)
if not is_speech:
filtered_files.append(output_path)
# write files that do not have speech
with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
for file in filtered_files:
f.write(str(file) + "\n")
else:
print("> No files Found !")
@ -53,10 +78,8 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
)
parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
parser.add_argument(
"-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
)
parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
parser.add_argument(
"-g",
@ -79,7 +102,23 @@ if __name__ == "__main__":
default=False,
help="If True use cuda",
)
parser.add_argument(
"--use_onnx",
type=bool,
default=False,
help="If True use onnx",
)
parser.add_argument(
"--num_processes",
type=int,
default=1,
help="Number of processes to use",
)
args = parser.parse_args()
if args.output_dir == "":
args.output_dir = args.input_dir
# load the model and utils
model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
preprocess_audios()


@ -2,8 +2,8 @@ import argparse
import glob
import os
from argparse import RawTextHelpFormatter
from distutils.dir_util import copy_tree
from multiprocessing import Pool
from shutil import copytree
import librosa
import soundfile as sf
@ -16,8 +16,25 @@ def resample_file(func_args):
sf.write(filename, y, sr)
if __name__ == "__main__":
def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
if output_dir:
print("Recursively copying the input folder...")
copytree(input_dir, output_dir)
input_dir = output_dir
print("Resampling the audio files...")
audio_files = glob.glob(os.path.join(input_dir, f"**/*.{file_ext}"), recursive=True)
print(f"Found {len(audio_files)} files...")
audio_files = list(zip(audio_files, len(audio_files) * [output_sr]))
with Pool(processes=n_jobs) as p:
with tqdm(total=len(audio_files)) as pbar:
for _, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
pbar.update()
print("Done !")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""Resample a folder recusively with librosa
Can be used in place or create a copy of the folder as an output.\n\n
@ -70,18 +87,4 @@ if __name__ == "__main__":
args = parser.parse_args()
if args.output_dir:
print("Recursively copying the input folder...")
copy_tree(args.input_dir, args.output_dir)
args.input_dir = args.output_dir
print("Resampling the audio files...")
audio_files = glob.glob(os.path.join(args.input_dir, f"**/*.{args.file_ext}"), recursive=True)
print(f"Found {len(audio_files)} files...")
audio_files = list(zip(audio_files, len(audio_files) * [args.output_sr]))
with Pool(processes=args.n_jobs) as p:
with tqdm(total=len(audio_files)) as pbar:
for i, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
pbar.update()
print("Done !")
resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
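Likewise, the resampling logic is now wrapped in a `resample_files()` function, so it can be reused from Python; a sketch with placeholder paths, assuming this script is `TTS/bin/resample.py`:
```
from TTS.bin.resample import resample_files

# Copy the corpus into output_dir, then resample every wav there to 22050 Hz (placeholder paths).
resample_files(
    input_dir="/data/my_corpus",
    output_sr=22050,
    output_dir="/data/my_corpus_22k",
    file_ext="wav",
    n_jobs=4,
)
```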


@ -2,14 +2,133 @@
# -*- coding: utf-8 -*-
import argparse
import contextlib
import sys
from argparse import RawTextHelpFormatter
# pylint: disable=redefined-outer-name, unused-argument
from pathlib import Path
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
description = """
Synthesize speech on command line.
You can either use your trained model or choose a model from the provided list.
If you don't specify any models, then it uses LJSpeech based English model.
#### Single Speaker Models
- List provided models:
```
$ tts --list_models
```
- Get model info (for both tts_models and vocoder_models):
- Query by type/name:
The model_info_by_name uses the name as it appears in the output of --list_models.
```
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
```
For example:
```
$ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
$ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
```
- Query by type/idx:
The model_query_idx uses the corresponding idx from --list_models.
```
$ tts --model_info_by_idx "<model_type>/<model_query_idx>"
```
For example:
```
$ tts --model_info_by_idx tts_models/3
```
- Query model info by full name:
```
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
```
- Run TTS with default models:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav
```
- Run TTS and pipe out the generated TTS wav file data:
```
$ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
```
- Run a TTS model with its default vocoder model:
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
```
For example:
```
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
```
- Run with specific TTS and vocoder models from the list:
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
```
For example:
```
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
```
- Run your own TTS model (Using Griffin-Lim Vocoder):
```
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```
- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```
#### Multi-speaker Models
- List the available speakers and choose a <speaker_id> among them:
```
$ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
```
- Run the multi-speaker TTS model with the target speaker ID:
```
$ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
```
- Run your own multi-speaker TTS model:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
### Voice Conversion Models
```
$ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
```
"""
def str2bool(v):
@ -23,86 +142,6 @@ def str2bool(v):
def main():
description = """Synthesize speech on command line.
You can either use your trained model or choose a model from the provided list.
If you don't specify any models, then it uses LJSpeech based English model.
## Example Runs
### Single Speaker Models
- List provided models:
```
$ tts --list_models
```
- Query info for model info by idx:
```
$ tts --model_info_by_idx "<model_type>/<model_query_idx>"
```
- Query info for model info by full name:
```
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
```
- Run TTS with default models:
```
$ tts --text "Text for TTS"
```
- Run a TTS model with its default vocoder model:
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>
```
- Run with specific TTS and vocoder models from the list:
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --output_path
```
- Run your own TTS model (Using Griffin-Lim Vocoder):
```
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```
- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```
### Multi-speaker Models
- List the available speakers and choose as <speaker_id> among them:
```
$ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
```
- Run the multi-speaker TTS model with the target speaker ID:
```
$ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
```
- Run your own multi-speaker TTS model:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
"""
# We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
# documentation in sync more easily.
parser = argparse.ArgumentParser(
description=description.replace(" ```\n", ""),
formatter_class=RawTextHelpFormatter,
@ -162,6 +201,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
help="Output wav file path.",
)
parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
parser.add_argument(
"--vocoder_path",
type=str,
@ -176,7 +216,15 @@ If you don't specify any models, then it uses LJSpeech based English model.
default=None,
)
parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
parser.add_argument(
"--pipe_out",
help="stdout the generated TTS wav file for shell pipe.",
type=str2bool,
nargs="?",
const=True,
default=False,
)
# args for multi-speaker synthesis
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
@ -238,6 +286,34 @@ If you don't specify any models, then it uses LJSpeech based English model.
help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
default=None,
)
parser.add_argument(
"--progress_bar",
type=str2bool,
help="If true shows a progress bar for the model download. Defaults to True",
default=True,
)
# voice conversion args
parser.add_argument(
"--source_wav",
type=str,
default=None,
help="Original audio file to convert in the voice of the target_wav",
)
parser.add_argument(
"--target_wav",
type=str,
default=None,
help="Target audio file to convert in the voice of the source_wav",
)
parser.add_argument(
"--voice_dir",
type=str,
default=None,
help="Voice dir for tortoise model",
)
args = parser.parse_args()
# print the description if either text or list_models is not set
@ -249,118 +325,169 @@ If you don't specify any models, then it uses LJSpeech based English model.
args.reference_wav,
args.model_info_by_idx,
args.model_info_by_name,
args.source_wav,
args.target_wav,
]
if not any(check_args):
parser.parse_args(["-h"])
# load model manager
path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path)
pipe_out = sys.stdout if args.pipe_out else None
model_path = None
config_path = None
speakers_file_path = None
language_ids_file_path = None
vocoder_path = None
vocoder_config_path = None
encoder_path = None
encoder_config_path = None
with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
# Late-import to make things load faster
from TTS.api import TTS
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
# CASE1 #list : list pre-trained TTS models
if args.list_models:
manager.list_models()
sys.exit()
# load model manager
path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path, progress_bar=args.progress_bar)
api = TTS()
# CASE2 #info : model info of pre-trained TTS models
if args.model_info_by_idx:
model_query = args.model_info_by_idx
manager.model_info_by_idx(model_query)
sys.exit()
tts_path = None
tts_config_path = None
speakers_file_path = None
language_ids_file_path = None
vocoder_path = None
vocoder_config_path = None
encoder_path = None
encoder_config_path = None
vc_path = None
vc_config_path = None
model_dir = None
if args.model_info_by_name:
model_query_full_name = args.model_info_by_name
manager.model_info_by_full_name(model_query_full_name)
sys.exit()
# CASE1 #list : list pre-trained TTS models
if args.list_models:
manager.list_models()
sys.exit()
# CASE3: load pre-trained model paths
if args.model_name is not None and not args.model_path:
model_path, config_path, model_item = manager.download_model(args.model_name)
args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
# CASE2 #info : model info for pre-trained TTS models
if args.model_info_by_idx:
model_query = args.model_info_by_idx
manager.model_info_by_idx(model_query)
sys.exit()
if args.vocoder_name is not None and not args.vocoder_path:
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
if args.model_info_by_name:
model_query_full_name = args.model_info_by_name
manager.model_info_by_full_name(model_query_full_name)
sys.exit()
# CASE4: set custom model paths
if args.model_path is not None:
model_path = args.model_path
config_path = args.config_path
speakers_file_path = args.speakers_file_path
language_ids_file_path = args.language_ids_file_path
# CASE3: load pre-trained model paths
if args.model_name is not None and not args.model_path:
model_path, config_path, model_item = manager.download_model(args.model_name)
# tts model
if model_item["model_type"] == "tts_models":
tts_path = model_path
tts_config_path = config_path
if "default_vocoder" in model_item:
args.vocoder_name = (
model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
)
if args.vocoder_path is not None:
vocoder_path = args.vocoder_path
vocoder_config_path = args.vocoder_config_path
# voice conversion model
if model_item["model_type"] == "voice_conversion_models":
vc_path = model_path
vc_config_path = config_path
if args.encoder_path is not None:
encoder_path = args.encoder_path
encoder_config_path = args.encoder_config_path
# tts model with multiple files to be loaded from the directory path
if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
model_dir = model_path
tts_path = None
tts_config_path = None
args.vocoder_name = None
# load models
synthesizer = Synthesizer(
model_path,
config_path,
speakers_file_path,
language_ids_file_path,
vocoder_path,
vocoder_config_path,
encoder_path,
encoder_config_path,
args.use_cuda,
)
# load vocoder
if args.vocoder_name is not None and not args.vocoder_path:
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
# query speaker ids of a multi-speaker model.
if args.list_speaker_idxs:
print(
" > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
)
print(synthesizer.tts_model.speaker_manager.ids)
return
# CASE4: set custom model paths
if args.model_path is not None:
tts_path = args.model_path
tts_config_path = args.config_path
speakers_file_path = args.speakers_file_path
language_ids_file_path = args.language_ids_file_path
# query language ids of a multi-lingual model.
if args.list_language_idxs:
print(
" > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
)
print(synthesizer.tts_model.language_manager.ids)
return
if args.vocoder_path is not None:
vocoder_path = args.vocoder_path
vocoder_config_path = args.vocoder_config_path
# check the arguments against a multi-speaker model.
if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
print(
" [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
"select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
)
return
if args.encoder_path is not None:
encoder_path = args.encoder_path
encoder_config_path = args.encoder_config_path
# RUN THE SYNTHESIS
if args.text:
print(" > Text: {}".format(args.text))
device = args.device
if args.use_cuda:
device = "cuda"
# kick it
wav = synthesizer.tts(
args.text,
args.speaker_idx,
args.language_idx,
args.speaker_wav,
reference_wav=args.reference_wav,
style_wav=args.capacitron_style_wav,
style_text=args.capacitron_style_text,
reference_speaker_name=args.reference_speaker_idx,
)
# load models
synthesizer = Synthesizer(
tts_path,
tts_config_path,
speakers_file_path,
language_ids_file_path,
vocoder_path,
vocoder_config_path,
encoder_path,
encoder_config_path,
vc_path,
vc_config_path,
model_dir,
args.voice_dir,
).to(device)
# save the results
print(" > Saving output to {}".format(args.out_path))
synthesizer.save_wav(wav, args.out_path)
# query speaker ids of a multi-speaker model.
if args.list_speaker_idxs:
print(
" > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
)
print(synthesizer.tts_model.speaker_manager.name_to_id)
return
# query language ids of a multi-lingual model.
if args.list_language_idxs:
print(
" > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
)
print(synthesizer.tts_model.language_manager.name_to_id)
return
# check the arguments against a multi-speaker model.
if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
print(
" [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
"select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
)
return
# RUN THE SYNTHESIS
if args.text:
print(" > Text: {}".format(args.text))
# kick it
if tts_path is not None:
wav = synthesizer.tts(
args.text,
speaker_name=args.speaker_idx,
language_name=args.language_idx,
speaker_wav=args.speaker_wav,
reference_wav=args.reference_wav,
style_wav=args.capacitron_style_wav,
style_text=args.capacitron_style_text,
reference_speaker_name=args.reference_speaker_idx,
)
elif vc_path is not None:
wav = synthesizer.voice_conversion(
source_wav=args.source_wav,
target_wav=args.target_wav,
)
elif model_dir is not None:
wav = synthesizer.tts(
args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
)
# save the results
print(" > Saving output to {}".format(args.out_path))
synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
if __name__ == "__main__":

View File

@ -8,17 +8,17 @@ import traceback
import torch
from torch.utils.data import DataLoader
from trainer.io import copy_model_files, save_best_model, save_checkpoint
from trainer.torch import NoamLR
from trainer.trainer_utils import get_optimizer
from TTS.encoder.dataset import EncoderDataset
from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
from TTS.encoder.utils.generic_utils import setup_encoder_model
from TTS.encoder.utils.training import init_training
from TTS.encoder.utils.visual import plot_embeddings
from TTS.tts.datasets import load_tts_samples
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
from TTS.utils.io import copy_model_files
from TTS.utils.samplers import PerfectBatchSampler
from TTS.utils.training import check_update
@ -125,7 +125,7 @@ def evaluation(model, criterion, data_loader, global_step):
def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
model.train()
best_loss = float("inf")
best_loss = {"train_loss": None, "eval_loss": float("inf")}
avg_loader_time = 0
end_time = time.time()
for epoch in range(c.epochs):
@ -222,7 +222,9 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
if global_step % c.save_step == 0:
# save model
save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch)
save_checkpoint(
c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict()
)
end_time = time.time()
@ -245,7 +247,18 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
flush=True,
)
# save the best checkpoint
best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
best_loss = save_best_model(
{"train_loss": None, "eval_loss": eval_loss},
best_loss,
c,
model,
optimizer,
None,
global_step,
epoch,
OUT_PATH,
criterion=criterion.state_dict(),
)
model.train()
return best_loss, global_step
@ -276,7 +289,7 @@ def main(args): # pylint: disable=redefined-outer-name
if c.loss == "softmaxproto" and c.model != "speaker_encoder":
c.map_classid_to_classname = map_classid_to_classname
copy_model_files(c, OUT_PATH)
copy_model_files(c, OUT_PATH, new_fields={})
if args.restore_path:
criterion, args.restore_step = model.load_checkpoint(

View File

@ -16,12 +16,9 @@ def read_json_with_comments(json_path):
# fallback to json
with fsspec.open(json_path, "r", encoding="utf-8") as f:
input_str = f.read()
# handle comments
input_str = re.sub(r"\\\n", "", input_str)
input_str = re.sub(r"//.*\n", "\n", input_str)
data = json.loads(input_str)
return data
# handle comments but not urls with //
input_str = re.sub(r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str)
return json.loads(input_str)
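To see why the single regex works: quoted strings are matched first, so a `//` inside a URL value is preserved, while real `//` comments outside strings are dropped. A small self-contained check (the pattern is transcribed here, so treat the exact escaping as an assumption):
```
import json
import re

COMMENT_PATTERN = r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\r\n])*?\*/)|(//.*)"

raw = """{
    // this line comment should be stripped
    "model_url": "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
    "default_vocoder": null  // trailing comment, also stripped
}"""

cleaned = re.sub(COMMENT_PATTERN, lambda m: m.group(1) or m.group(2) or "", raw)
data = json.loads(cleaned)
assert "//" in data["model_url"]  # the // inside the quoted URL survives
```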
def register_config(model_name: str) -> Coqpit:
"""Find the right config for the given model name.
@ -37,7 +34,13 @@ def register_config(model_name: str) -> Coqpit:
"""
config_class = None
config_name = model_name + "_config"
paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs"]
# TODO: fix this
if model_name == "xtts":
from TTS.tts.configs.xtts_config import XttsConfig
config_class = XttsConfig
paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"]
for path in paths:
try:
config_class = find_module(path, config_name)

View File

@ -62,7 +62,7 @@ class BaseAudioConfig(Coqpit):
Maximum frequency of the F0 frames. Defaults to ```640```.
pitch_fmin (float, optional):
Minimum frequency of the F0 frames. Defaults to ```0```.
Minimum frequency of the F0 frames. Defaults to ```1```.
trim_db (int):
Silence threshold used for silence trimming. Defaults to 45.
@ -144,7 +144,7 @@ class BaseAudioConfig(Coqpit):
do_amp_to_db_mel: bool = True
# f0 params
pitch_fmax: float = 640.0
pitch_fmin: float = 0.0
pitch_fmin: float = 1.0
# normalization params
signal_norm: bool = True
min_level_db: int = -100
@ -193,21 +193,27 @@ class BaseDatasetConfig(Coqpit):
"""Base config for TTS datasets.
Args:
name (str):
Dataset name that defines the preprocessor in use. Defaults to None.
formatter (str):
Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.
dataset_name (str):
Unique name for the dataset. Defaults to `""`.
path (str):
Root path to the dataset files. Defaults to None.
Root path to the dataset files. Defaults to `""`.
meta_file_train (str):
Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
Defaults to None.
Defaults to `""`.
ignored_speakers (List):
List of speaker IDs that are not used during training. Defaults to None.
language (str):
Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to None.
Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.
phonemizer (str):
Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`.
meta_file_val (str):
Name of the dataset meta file that defines the instances used at validation.
@ -217,11 +223,13 @@ class BaseDatasetConfig(Coqpit):
train the duration predictor.
"""
name: str = ""
formatter: str = ""
dataset_name: str = ""
path: str = ""
meta_file_train: str = ""
ignored_speakers: List[str] = None
language: str = ""
phonemizer: str = ""
meta_file_val: str = ""
meta_file_attn_mask: str = ""
@ -230,7 +238,7 @@ class BaseDatasetConfig(Coqpit):
):
"""Check config fields"""
c = asdict(self)
check_argument("name", c, restricted=True)
check_argument("formatter", c, restricted=True)
check_argument("path", c, restricted=True)
check_argument("meta_file_train", c, restricted=True)
check_argument("meta_file_val", c, restricted=False)

View File

@ -0,0 +1,2 @@
faster_whisper==0.9.0
gradio==4.7.1

View File

@ -0,0 +1,160 @@
import os
import gc
import torchaudio
import pandas
from faster_whisper import WhisperModel
from glob import glob
from tqdm import tqdm
import torch
import torchaudio
# torch.set_num_threads(1)
from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners
torch.set_num_threads(16)
import os
audio_types = (".wav", ".mp3", ".flac")
def list_audios(basePath, contains=None):
# return the set of files that are valid
return list_files(basePath, validExts=audio_types, contains=contains)
def list_files(basePath, validExts=None, contains=None):
# loop over the directory structure
for (rootDir, dirNames, filenames) in os.walk(basePath):
# loop over the filenames in the current directory
for filename in filenames:
# if the contains string is not none and the filename does not contain
# the supplied string, then ignore the file
if contains is not None and filename.find(contains) == -1:
continue
# determine the file extension of the current file
ext = filename[filename.rfind("."):].lower()
# check to see if the file is an audio and should be processed
if validExts is None or ext.endswith(validExts):
# construct the path to the audio and yield it
audioPath = os.path.join(rootDir, filename)
yield audioPath
def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):
audio_total_size = 0
# make sure that the output folder exists
os.makedirs(out_path, exist_ok=True)
# Loading Whisper
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Loading Whisper Model!")
asr_model = WhisperModel("large-v2", device=device, compute_type="float16")
metadata = {"audio_file": [], "text": [], "speaker_name": []}
if gradio_progress is not None:
tqdm_object = gradio_progress.tqdm(audio_files, desc="Formatting...")
else:
tqdm_object = tqdm(audio_files)
for audio_path in tqdm_object:
wav, sr = torchaudio.load(audio_path)
# stereo to mono if needed
if wav.size(0) != 1:
wav = torch.mean(wav, dim=0, keepdim=True)
wav = wav.squeeze()
audio_total_size += (wav.size(-1) / sr)
segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
segments = list(segments)
i = 0
sentence = ""
sentence_start = None
first_word = True
# add all segment words to a single list
words_list = []
for _, segment in enumerate(segments):
words = list(segment.words)
words_list.extend(words)
# process each word
for word_idx, word in enumerate(words_list):
if first_word:
sentence_start = word.start
# If it is the first sentence, add the buffer or use the beginning of the file
if word_idx == 0:
sentence_start = max(sentence_start - buffer, 0) # Add buffer to the sentence start
else:
# get previous sentence end
previous_word_end = words_list[word_idx - 1].end
# add the buffer or use the middle of the silence between the previous sentence and the current one
sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start)/2)
sentence = word.word
first_word = False
else:
sentence += word.word
if word.word[-1] in ["!", ".", "?"]:
sentence = sentence[1:]
# Expand number and abbreviations plus normalization
sentence = multilingual_cleaners(sentence, target_language)
audio_file_name, _ = os.path.splitext(os.path.basename(audio_path))
audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"
# Check for the next word's existence
if word_idx + 1 < len(words_list):
next_word_start = words_list[word_idx + 1].start
else:
# If there are no more words, this is the last sentence, so use the audio length as the next word start
next_word_start = (wav.shape[0] - 1) / sr
# Average the current word end and next word start
word_end = min((word.end + next_word_start) / 2, word.end + buffer)
absoulte_path = os.path.join(out_path, audio_file)
os.makedirs(os.path.dirname(absoulte_path), exist_ok=True)
i += 1
first_word = True
audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0)
# if the audio is too short (i.e. < 0.33 seconds), ignore it
if audio.size(-1) >= sr/3:
torchaudio.save(absoulte_path,
audio,
sr
)
else:
continue
metadata["audio_file"].append(audio_file)
metadata["text"].append(sentence)
metadata["speaker_name"].append(speaker_name)
df = pandas.DataFrame(metadata)
df = df.sample(frac=1)
num_val_samples = int(len(df)*eval_percentage)
df_eval = df[:num_val_samples]
df_train = df[num_val_samples:]
df_train = df_train.sort_values('audio_file')
train_metadata_path = os.path.join(out_path, "metadata_train.csv")
df_train.to_csv(train_metadata_path, sep="|", index=False)
eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
df_eval = df_eval.sort_values('audio_file')
df_eval.to_csv(eval_metadata_path, sep="|", index=False)
# deallocate VRAM and RAM
del asr_model, df_train, df_eval, df, metadata
gc.collect()
return train_metadata_path, eval_metadata_path, audio_total_size
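A hypothetical invocation of the helpers above, mirroring how the demo UI below wires them together (input folder, output path, and speaker name are placeholders):
```
# collect supported audio files and build train/eval metadata CSVs
audio_files = list(list_audios("/data/my_voice"))  # placeholder input folder
train_csv, eval_csv, total_audio_seconds = format_audio_list(
    audio_files,
    target_language="en",
    out_path="/tmp/xtts_ft/dataset",
    speaker_name="coqui",
)
print(train_csv, eval_csv, total_audio_seconds)
```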

View File

@ -0,0 +1,172 @@
import os
import gc
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
from TTS.utils.manage import ModelManager
def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path, max_audio_length=255995):
# Logging parameters
RUN_NAME = "GPT_XTTS_FT"
PROJECT_NAME = "XTTS_trainer"
DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None
# Set here the path that the checkpoints will be saved. Default: ./run/training/
OUT_PATH = os.path.join(output_path, "run", "training")
# Training Parameters
OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False
START_WITH_EVAL = False # if True it will start with evaluation
BATCH_SIZE = batch_size # set here the batch size
GRAD_ACUMM_STEPS = grad_acumm # set here the grad accumulation steps
# Define here the dataset that you want to use for the fine-tuning on.
config_dataset = BaseDatasetConfig(
formatter="coqui",
dataset_name="ft_dataset",
path=os.path.dirname(train_csv),
meta_file_train=train_csv,
meta_file_val=eval_csv,
language=language,
)
# Add here the configs of the datasets
DATASETS_CONFIG_LIST = [config_dataset]
# Define the path where XTTS v2.0.1 files will be downloaded
CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
# DVAE files
DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
# Set the path to the downloaded files
DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
# download DVAE files if needed
if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
print(" > Downloading DVAE files!")
ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
# Download XTTS v2.0 checkpoint if needed
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
XTTS_CONFIG_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json"
# XTTS transfer learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK)) # vocab.json file
XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK)) # model.pth file
XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK)) # config.json file
# download XTTS v2.0 files if needed
if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
print(" > Downloading XTTS v2.0 files!")
ModelManager._download_model_files(
[TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK, XTTS_CONFIG_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
)
# init args and config
model_args = GPTArgs(
max_conditioning_length=132300, # 6 secs
min_conditioning_length=66150, # 3 secs
debug_loading_failures=False,
max_wav_length=max_audio_length, # ~11.6 seconds
max_text_length=200,
mel_norm_file=MEL_NORM_FILE,
dvae_checkpoint=DVAE_CHECKPOINT,
xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune
tokenizer_file=TOKENIZER_FILE,
gpt_num_audio_tokens=1026,
gpt_start_audio_token=1024,
gpt_stop_audio_token=1025,
gpt_use_masking_gt_prompt_approach=True,
gpt_use_perceiver_resampler=True,
)
# define audio config
audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
# training parameters config
config = GPTTrainerConfig(
epochs=num_epochs,
output_path=OUT_PATH,
model_args=model_args,
run_name=RUN_NAME,
project_name=PROJECT_NAME,
run_description="""
GPT XTTS training
""",
dashboard_logger=DASHBOARD_LOGGER,
logger_uri=LOGGER_URI,
audio=audio_config,
batch_size=BATCH_SIZE,
batch_group_size=48,
eval_batch_size=BATCH_SIZE,
num_loader_workers=8,
eval_split_max_size=256,
print_step=50,
plot_step=100,
log_model_step=100,
save_step=1000,
save_n_checkpoints=1,
save_checkpoints=True,
# target_loss="loss",
print_eval=False,
# Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
optimizer="AdamW",
optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
lr=5e-06, # learning rate
lr_scheduler="MultiStepLR",
# it was adjusted accordingly for the new step scheme
lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
test_sentences=[],
)
# init the model from config
model = GPTTrainer.init_from_config(config)
# load training samples
train_samples, eval_samples = load_tts_samples(
DATASETS_CONFIG_LIST,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(
restore_path=None,  # the xtts checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it with the Trainer restore_path parameter
skip_train_epoch=False,
start_with_eval=START_WITH_EVAL,
grad_accum_steps=GRAD_ACUMM_STEPS,
),
config,
output_path=OUT_PATH,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
)
trainer.fit()
# get the longest text audio file to use as speaker reference
samples_len = [len(item["text"].split(" ")) for item in train_samples]
longest_text_idx = samples_len.index(max(samples_len))
speaker_ref = train_samples[longest_text_idx]["audio_file"]
trainer_out_path = trainer.output_path
# deallocate VRAM and RAM
del model, trainer, train_samples, eval_samples
gc.collect()
return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref
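For orientation, a hypothetical call to `train_gpt` using the demo's default hyperparameters (the CSV paths match the files written by `format_audio_list` above; all values are placeholders):
```
config_path, xtts_checkpoint, vocab_file, exp_path, speaker_ref = train_gpt(
    language="en",
    num_epochs=10,
    batch_size=4,
    grad_acumm=1,
    train_csv="/tmp/xtts_ft/dataset/metadata_train.csv",
    eval_csv="/tmp/xtts_ft/dataset/metadata_eval.csv",
    output_path="/tmp/xtts_ft",
    max_audio_length=int(11 * 22050),  # 11 seconds expressed in waveform frames
)
```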

View File

@ -0,0 +1,415 @@
import argparse
import os
import sys
import tempfile
import gradio as gr
import librosa.display
import numpy as np
import os
import torch
import torchaudio
import traceback
from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
def clear_gpu_cache():
# clear the GPU cache
if torch.cuda.is_available():
torch.cuda.empty_cache()
XTTS_MODEL = None
def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
global XTTS_MODEL
clear_gpu_cache()
if not xtts_checkpoint or not xtts_config or not xtts_vocab:
return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"
config = XttsConfig()
config.load_json(xtts_config)
XTTS_MODEL = Xtts.init_from_config(config)
print("Loading XTTS model! ")
XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
if torch.cuda.is_available():
XTTS_MODEL.cuda()
print("Model Loaded!")
return "Model Loaded!"
def run_tts(lang, tts_text, speaker_audio_file):
if XTTS_MODEL is None or not speaker_audio_file:
return "You need to run the previous step to load the model !!", None, None
gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(audio_path=speaker_audio_file, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_length=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs)
out = XTTS_MODEL.inference(
text=tts_text,
language=lang,
gpt_cond_latent=gpt_cond_latent,
speaker_embedding=speaker_embedding,
temperature=XTTS_MODEL.config.temperature, # Add custom parameters here
length_penalty=XTTS_MODEL.config.length_penalty,
repetition_penalty=XTTS_MODEL.config.repetition_penalty,
top_k=XTTS_MODEL.config.top_k,
top_p=XTTS_MODEL.config.top_p,
)
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
out["wav"] = torch.tensor(out["wav"]).unsqueeze(0)
out_path = fp.name
torchaudio.save(out_path, out["wav"], 24000)
return "Speech generated !", out_path, speaker_audio_file
# define a logger to redirect
class Logger:
def __init__(self, filename="log.out"):
self.log_file = filename
self.terminal = sys.stdout
self.log = open(self.log_file, "w")
def write(self, message):
self.terminal.write(message)
self.log.write(message)
def flush(self):
self.terminal.flush()
self.log.flush()
def isatty(self):
return False
# redirect stdout and stderr to a file
sys.stdout = Logger()
sys.stderr = sys.stdout
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
import logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.StreamHandler(sys.stdout)
]
)
def read_logs():
sys.stdout.flush()
with open(sys.stdout.log_file, "r") as f:
return f.read()
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""XTTS fine-tuning demo\n\n"""
"""
Example runs:
python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port
""",
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
"--port",
type=int,
help="Port to run the gradio demo. Default: 5003",
default=5003,
)
parser.add_argument(
"--out_path",
type=str,
help="Output path (where data and checkpoints will be saved) Default: /tmp/xtts_ft/",
default="/tmp/xtts_ft/",
)
parser.add_argument(
"--num_epochs",
type=int,
help="Number of epochs to train. Default: 10",
default=10,
)
parser.add_argument(
"--batch_size",
type=int,
help="Batch size. Default: 4",
default=4,
)
parser.add_argument(
"--grad_acumm",
type=int,
help="Grad accumulation steps. Default: 1",
default=1,
)
parser.add_argument(
"--max_audio_length",
type=int,
help="Max permitted audio size in seconds. Default: 11",
default=11,
)
args = parser.parse_args()
with gr.Blocks() as demo:
with gr.Tab("1 - Data processing"):
out_path = gr.Textbox(
label="Output path (where data and checkpoints will be saved):",
value=args.out_path,
)
# upload_file = gr.Audio(
# sources="upload",
# label="Select here the audio files that you want to use for XTTS trainining !",
# type="filepath",
# )
upload_file = gr.File(
file_count="multiple",
label="Select here the audio files that you want to use for XTTS trainining (Supported formats: wav, mp3, and flac)",
)
lang = gr.Dropdown(
label="Dataset Language",
value="en",
choices=[
"en",
"es",
"fr",
"de",
"it",
"pt",
"pl",
"tr",
"ru",
"nl",
"cs",
"ar",
"zh",
"hu",
"ko",
"ja"
],
)
progress_data = gr.Label(
label="Progress:"
)
logs = gr.Textbox(
label="Logs:",
interactive=False,
)
demo.load(read_logs, None, logs, every=1)
prompt_compute_btn = gr.Button(value="Step 1 - Create dataset")
def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
clear_gpu_cache()
out_path = os.path.join(out_path, "dataset")
os.makedirs(out_path, exist_ok=True)
if audio_path is None:
return "You should provide one or multiple audio files! If you provided it, probably the upload of the files is not finished yet!", "", ""
else:
try:
train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
except:
traceback.print_exc()
error = traceback.format_exc()
return f"The data processing was interrupted due an error !! Please check the console to verify the full error message! \n Error summary: {error}", "", ""
clear_gpu_cache()
# if audio total len is less than 2 minutes raise an error
if audio_total_size < 120:
message = "The sum of the duration of the audios that you provided should be at least 2 minutes!"
print(message)
return message, "", ""
print("Dataset Processed!")
return "Dataset Processed!", train_meta, eval_meta
with gr.Tab("2 - Fine-tuning XTTS Encoder"):
train_csv = gr.Textbox(
label="Train CSV:",
)
eval_csv = gr.Textbox(
label="Eval CSV:",
)
num_epochs = gr.Slider(
label="Number of epochs:",
minimum=1,
maximum=100,
step=1,
value=args.num_epochs,
)
batch_size = gr.Slider(
label="Batch size:",
minimum=2,
maximum=512,
step=1,
value=args.batch_size,
)
grad_acumm = gr.Slider(
label="Grad accumulation steps:",
minimum=2,
maximum=128,
step=1,
value=args.grad_acumm,
)
max_audio_length = gr.Slider(
label="Max permitted audio size in seconds:",
minimum=2,
maximum=20,
step=1,
value=args.max_audio_length,
)
progress_train = gr.Label(
label="Progress:"
)
logs_tts_train = gr.Textbox(
label="Logs:",
interactive=False,
)
demo.load(read_logs, None, logs_tts_train, every=1)
train_btn = gr.Button(value="Step 2 - Run the training")
def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length):
clear_gpu_cache()
if not train_csv or not eval_csv:
return "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", "", "", "", ""
try:
# convert seconds to waveform frames
max_audio_length = int(max_audio_length * 22050)
config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path, max_audio_length=max_audio_length)
except:
traceback.print_exc()
error = traceback.format_exc()
return f"The training was interrupted due an error !! Please check the console to check the full error message! \n Error summary: {error}", "", "", "", ""
# copy original files to avoid parameters changes issues
os.system(f"cp {config_path} {exp_path}")
os.system(f"cp {vocab_file} {exp_path}")
ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
print("Model training done!")
clear_gpu_cache()
return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
with gr.Tab("3 - Inference"):
with gr.Row():
with gr.Column() as col1:
xtts_checkpoint = gr.Textbox(
label="XTTS checkpoint path:",
value="",
)
xtts_config = gr.Textbox(
label="XTTS config path:",
value="",
)
xtts_vocab = gr.Textbox(
label="XTTS vocab path:",
value="",
)
progress_load = gr.Label(
label="Progress:"
)
load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model")
with gr.Column() as col2:
speaker_reference_audio = gr.Textbox(
label="Speaker reference audio:",
value="",
)
tts_language = gr.Dropdown(
label="Language",
value="en",
choices=[
"en",
"es",
"fr",
"de",
"it",
"pt",
"pl",
"tr",
"ru",
"nl",
"cs",
"ar",
"zh",
"hu",
"ko",
"ja",
]
)
tts_text = gr.Textbox(
label="Input Text.",
value="This model sounds really good and above all, it's reasonably fast.",
)
tts_btn = gr.Button(value="Step 4 - Inference")
with gr.Column() as col3:
progress_gen = gr.Label(
label="Progress:"
)
tts_output_audio = gr.Audio(label="Generated Audio.")
reference_audio = gr.Audio(label="Reference audio used.")
prompt_compute_btn.click(
fn=preprocess_dataset,
inputs=[
upload_file,
lang,
out_path,
],
outputs=[
progress_data,
train_csv,
eval_csv,
],
)
train_btn.click(
fn=train_model,
inputs=[
lang,
train_csv,
eval_csv,
num_epochs,
batch_size,
grad_acumm,
out_path,
max_audio_length,
],
outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
)
load_btn.click(
fn=load_model,
inputs=[
xtts_checkpoint,
xtts_config,
xtts_vocab
],
outputs=[progress_load],
)
tts_btn.click(
fn=run_tts,
inputs=[
tts_language,
tts_text,
speaker_reference_audio,
],
outputs=[progress_gen, tts_output_audio, reference_audio],
)
demo.launch(
share=True,
debug=False,
server_port=args.port,
server_name="0.0.0.0"
)
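The same load-and-infer flow as `load_model`/`run_tts` above can also be scripted without the Gradio UI; a minimal headless sketch (all paths are placeholders pointing at a fine-tuning output folder):
```
import torch
import torchaudio

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# placeholder paths: config/vocab copied into the experiment folder plus the best checkpoint
CONFIG_PATH = "/tmp/xtts_ft/run/training/GPT_XTTS_FT/config.json"
VOCAB_PATH = "/tmp/xtts_ft/run/training/GPT_XTTS_FT/vocab.json"
CHECKPOINT_PATH = "/tmp/xtts_ft/run/training/GPT_XTTS_FT/best_model.pth"
SPEAKER_WAV = "/tmp/xtts_ft/dataset/wavs/reference_00000000.wav"

config = XttsConfig()
config.load_json(CONFIG_PATH)
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_path=CHECKPOINT_PATH, vocab_path=VOCAB_PATH, use_deepspeed=False)
if torch.cuda.is_available():
    model.cuda()

gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
    audio_path=SPEAKER_WAV,
    gpt_cond_len=config.gpt_cond_len,
    max_ref_length=config.max_ref_len,
    sound_norm_refs=config.sound_norm_refs,
)
out = model.inference(
    text="This model sounds really good and above all, it's reasonably fast.",
    language="en",
    gpt_cond_latent=gpt_cond_latent,
    speaker_embedding=speaker_embedding,
    temperature=config.temperature,
)
torchaudio.save("xtts_ft_output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
```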

View File

@ -107,11 +107,18 @@ class BaseEncoder(nn.Module):
return criterion
def load_checkpoint(
self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None
self,
config: Coqpit,
checkpoint_path: str,
eval: bool = False,
use_cuda: bool = False,
criterion=None,
cache=False,
):
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
try:
self.load_state_dict(state["model"])
print(" > Model fully restored. ")
except (KeyError, RuntimeError) as error:
# If eval raise the error
if eval:

View File

@ -161,16 +161,14 @@ class ResNetSpeakerEncoder(BaseEncoder):
Shapes:
- x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
"""
with torch.no_grad():
with torch.cuda.amp.autocast(enabled=False):
x.squeeze_(1)
# if you torch spec compute it otherwise use the mel spec computed by the AP
if self.use_torch_spec:
x = self.torch_spec(x)
x.squeeze_(1)
# if using torch_spec, compute the spectrogram here; otherwise use the mel spec computed by the AP
if self.use_torch_spec:
x = self.torch_spec(x)
if self.log_input:
x = (x + 1e-6).log()
x = self.instancenorm(x).unsqueeze(1)
if self.log_input:
x = (x + 1e-6).log()
x = self.instancenorm(x).unsqueeze(1)
x = self.conv1(x)
x = self.relu(x)

View File

@ -1,20 +1,16 @@
import datetime
import glob
import os
import random
import re
import numpy as np
from scipy import signal
from TTS.encoder.models.lstm import LSTMSpeakerEncoder
from TTS.encoder.models.resnet import ResNetSpeakerEncoder
from TTS.utils.io import save_fsspec
class AugmentWAV(object):
def __init__(self, ap, augmentation_config):
self.ap = ap
self.use_additive_noise = False
@ -67,7 +63,6 @@ class AugmentWAV(object):
self.global_noise_list.append("RIR_AUG")
def additive_noise(self, noise_type, audio):
clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)
noise_list = random.sample(
@ -120,11 +115,6 @@ class AugmentWAV(object):
return self.additive_noise(noise_type, audio)
def to_camel(text):
text = text.capitalize()
return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
def setup_encoder_model(config: "Coqpit"):
if config.model_params["model_name"].lower() == "lstm":
model = LSTMSpeakerEncoder(
@ -144,41 +134,3 @@ def setup_encoder_model(config: "Coqpit"):
audio_config=config.audio,
)
return model
def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
checkpoint_path = "checkpoint_{}.pth".format(current_step)
checkpoint_path = os.path.join(out_path, checkpoint_path)
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
new_state_dict = model.state_dict()
state = {
"model": new_state_dict,
"optimizer": optimizer.state_dict() if optimizer is not None else None,
"criterion": criterion.state_dict(),
"step": current_step,
"epoch": epoch,
"loss": model_loss,
"date": datetime.date.today().strftime("%B %d, %Y"),
}
save_fsspec(state, checkpoint_path)
def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch):
if model_loss < best_loss:
new_state_dict = model.state_dict()
state = {
"model": new_state_dict,
"optimizer": optimizer.state_dict(),
"criterion": criterion.state_dict(),
"step": current_step,
"epoch": epoch,
"loss": model_loss,
"date": datetime.date.today().strftime("%B %d, %Y"),
}
best_loss = model_loss
bestmodel_path = "best_model.pth"
bestmodel_path = os.path.join(out_path, bestmodel_path)
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
save_fsspec(state, bestmodel_path)
return best_loss

View File

@ -1,38 +0,0 @@
import datetime
import os
from TTS.utils.io import save_fsspec
def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
checkpoint_path = "checkpoint_{}.pth".format(current_step)
checkpoint_path = os.path.join(out_path, checkpoint_path)
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
new_state_dict = model.state_dict()
state = {
"model": new_state_dict,
"optimizer": optimizer.state_dict() if optimizer is not None else None,
"step": current_step,
"loss": model_loss,
"date": datetime.date.today().strftime("%B %d, %Y"),
}
save_fsspec(state, checkpoint_path)
def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step):
if model_loss < best_loss:
new_state_dict = model.state_dict()
state = {
"model": new_state_dict,
"optimizer": optimizer.state_dict(),
"step": current_step,
"loss": model_loss,
"date": datetime.date.today().strftime("%B %d, %Y"),
}
best_loss = model_loss
bestmodel_path = "best_model.pth"
bestmodel_path = os.path.join(out_path, bestmodel_path)
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
save_fsspec(state, bestmodel_path)
return best_loss

View File

@ -3,13 +3,13 @@ from dataclasses import dataclass, field
from coqpit import Coqpit
from trainer import TrainerArgs, get_last_checkpoint
from trainer.io import copy_model_files
from trainer.logging import logger_factory
from trainer.logging.console_logger import ConsoleLogger
from TTS.config import load_config, register_config
from TTS.tts.utils.text.characters import parse_symbols
from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch
from TTS.utils.io import copy_model_files
@dataclass

View File

@ -23,7 +23,7 @@ colormap = (
[0, 0, 0],
[183, 183, 183],
],
dtype=np.float,
dtype=float,
)
/ 255
)

View File

@ -44,13 +44,16 @@ class BaseTrainerModel(TrainerModel):
return outputs_dict
@abstractmethod
def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
def load_checkpoint(
self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
) -> None:
"""Load a model checkpoint gile and get ready for training or inference.
Args:
config (Coqpit): Model configuration.
checkpoint_path (str): Path to the model checkpoint file.
eval (bool, optional): If true, init model for inference else for training. Defaults to False.
strcit (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
"""
...

View File

@ -5,9 +5,11 @@ import json
import os
import sys
from pathlib import Path
from threading import Lock
from typing import Union
from urllib.parse import parse_qs
from flask import Flask, render_template, request, send_file
from flask import Flask, render_template, render_template_string, request, send_file
from TTS.config import load_config
from TTS.utils.manage import ModelManager
@ -114,8 +116,13 @@ synthesizer = Synthesizer(
use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and (
synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None
)
speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
use_multi_language = hasattr(synthesizer.tts_model, "num_languages") and (
synthesizer.tts_model.num_languages > 1 or synthesizer.tts_languages_file is not None
)
language_manager = getattr(synthesizer.tts_model, "language_manager", None)
# TODO: set this from SpeakerManager
use_gst = synthesizer.tts_config.get("use_gst", False)
app = Flask(__name__)
@ -146,18 +153,28 @@ def index():
"index.html",
show_details=args.show_details,
use_multi_speaker=use_multi_speaker,
speaker_ids=speaker_manager.ids if speaker_manager is not None else None,
use_multi_language=use_multi_language,
speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None,
language_ids=language_manager.name_to_id if language_manager is not None else None,
use_gst=use_gst,
)
@app.route("/details")
def details():
model_config = load_config(args.tts_config)
if args.vocoder_config is not None and os.path.isfile(args.vocoder_config):
vocoder_config = load_config(args.vocoder_config)
if args.config_path is not None and os.path.isfile(args.config_path):
model_config = load_config(args.config_path)
else:
vocoder_config = None
if args.model_name is not None:
model_config = load_config(config_path)
if args.vocoder_config_path is not None and os.path.isfile(args.vocoder_config_path):
vocoder_config = load_config(args.vocoder_config_path)
else:
if args.vocoder_name is not None:
vocoder_config = load_config(vocoder_config_path)
else:
vocoder_config = None
return render_template(
"details.html",
@ -168,17 +185,68 @@ def details():
)
@app.route("/api/tts", methods=["GET"])
lock = Lock()
@app.route("/api/tts", methods=["GET", "POST"])
def tts():
text = request.args.get("text")
speaker_idx = request.args.get("speaker_id", "")
style_wav = request.args.get("style_wav", "")
style_wav = style_wav_uri_to_dict(style_wav)
print(" > Model input: {}".format(text))
print(" > Speaker Idx: {}".format(speaker_idx))
wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav)
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
with lock:
text = request.headers.get("text") or request.values.get("text", "")
speaker_idx = request.headers.get("speaker-id") or request.values.get("speaker_id", "")
language_idx = request.headers.get("language-id") or request.values.get("language_id", "")
style_wav = request.headers.get("style-wav") or request.values.get("style_wav", "")
style_wav = style_wav_uri_to_dict(style_wav)
print(f" > Model input: {text}")
print(f" > Speaker Idx: {speaker_idx}")
print(f" > Language Idx: {language_idx}")
wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav)
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
return send_file(out, mimetype="audio/wav")
# Basic MaryTTS compatibility layer
@app.route("/locales", methods=["GET"])
def mary_tts_api_locales():
"""MaryTTS-compatible /locales endpoint"""
# NOTE: We currently assume there is only one model active at the same time
if args.model_name is not None:
model_details = args.model_name.split("/")
else:
model_details = ["", "en", "", "default"]
return render_template_string("{{ locale }}\n", locale=model_details[1])
@app.route("/voices", methods=["GET"])
def mary_tts_api_voices():
"""MaryTTS-compatible /voices endpoint"""
# NOTE: We currently assume there is only one model active at the same time
if args.model_name is not None:
model_details = args.model_name.split("/")
else:
model_details = ["", "en", "", "default"]
return render_template_string(
"{{ name }} {{ locale }} {{ gender }}\n", name=model_details[3], locale=model_details[1], gender="u"
)
@app.route("/process", methods=["GET", "POST"])
def mary_tts_api_process():
"""MaryTTS-compatible /process endpoint"""
with lock:
if request.method == "POST":
data = parse_qs(request.get_data(as_text=True))
# NOTE: we ignore the LOCALE and VOICE params for now since we have only one active model
text = data.get("INPUT_TEXT", [""])[0]
else:
text = request.args.get("INPUT_TEXT", "")
print(f" > Model input: {text}")
wavs = synthesizer.tts(text)
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
return send_file(out, mimetype="audio/wav")
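A quick client-side check of the updated `/api/tts` endpoint (the port assumes the server's default; `speaker_id`/`language_id` only matter for multi-speaker or multi-lingual models):
```
import requests

# form fields are read via request.values; custom headers would work as well
resp = requests.post(
    "http://localhost:5002/api/tts",
    data={"text": "Hello from the demo server.", "speaker_id": "", "language_id": ""},
    timeout=60,
)
resp.raise_for_status()
with open("server_output.wav", "wb") as f:
    f.write(resp.content)
```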

View File

@ -65,7 +65,7 @@
</ul>
{%if use_gst%}
<input value='{"0": 0.1}' id="style_wav" placeholder="style wav (dict or path ot wav).." size=45
<input value='{"0": 0.1}' id="style_wav" placeholder="style wav (dict or path to wav).." size=45
type="text" name="style_wav">
{%endif%}
@ -81,6 +81,16 @@
</select><br /><br />
{%endif%}
{%if use_multi_language%}
Choose a language:
<select id="language_id" name=language_id method="GET" action="/">
{% for language_id in language_ids %}
<option value="{{language_id}}" SELECTED>{{language_id}}</option>"
{% endfor %}
</select><br /><br />
{%endif%}
{%if show_details%}
<button id="details-button" onclick="location.href = 'details'" name="model-details">Model
Details</button><br /><br />
@ -106,11 +116,12 @@
const text = q('#text').value
const speaker_id = getTextValue('#speaker_id')
const style_wav = getTextValue('#style_wav')
const language_id = getTextValue('#language_id')
if (text) {
q('#message').textContent = 'Synthesizing...'
q('#speak-button').disabled = true
q('#audio').hidden = true
synthesize(text, speaker_id, style_wav)
synthesize(text, speaker_id, style_wav, language_id)
}
e.preventDefault()
return false
@ -121,8 +132,8 @@
do_tts(e)
}
})
function synthesize(text, speaker_id = "", style_wav = "") {
fetch(`/api/tts?text=${encodeURIComponent(text)}&speaker_id=${encodeURIComponent(speaker_id)}&style_wav=${encodeURIComponent(style_wav)}`, { cache: 'no-cache' })
function synthesize(text, speaker_id = "", style_wav = "", language_id = "") {
fetch(`/api/tts?text=${encodeURIComponent(text)}&speaker_id=${encodeURIComponent(speaker_id)}&style_wav=${encodeURIComponent(style_wav)}&language_id=${encodeURIComponent(language_id)}`, { cache: 'no-cache' })
.then(function (res) {
if (!res.ok) throw Error(res.statusText)
return res.blob()

View File

@ -0,0 +1,105 @@
import os
from dataclasses import dataclass, field
from typing import Dict
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.layers.bark.model import GPTConfig
from TTS.tts.layers.bark.model_fine import FineGPTConfig
from TTS.tts.models.bark import BarkAudioConfig
from TTS.utils.generic_utils import get_user_data_dir
@dataclass
class BarkConfig(BaseTTSConfig):
"""Bark TTS configuration
Args:
model (str): model name that registers the model.
audio (BarkAudioConfig): audio configuration. Defaults to BarkAudioConfig().
num_chars (int): number of characters in the alphabet. Defaults to 0.
semantic_config (GPTConfig): semantic configuration. Defaults to GPTConfig().
fine_config (FineGPTConfig): fine configuration. Defaults to FineGPTConfig().
coarse_config (GPTConfig): coarse configuration. Defaults to GPTConfig().
CONTEXT_WINDOW_SIZE (int): GPT context window size. Defaults to 1024.
SEMANTIC_RATE_HZ (float): semantic tokens rate in Hz. Defaults to 49.9.
SEMANTIC_VOCAB_SIZE (int): semantic vocabulary size. Defaults to 10_000.
CODEBOOK_SIZE (int): encodec codebook size. Defaults to 1024.
N_COARSE_CODEBOOKS (int): number of coarse codebooks. Defaults to 2.
N_FINE_CODEBOOKS (int): number of fine codebooks. Defaults to 8.
COARSE_RATE_HZ (int): coarse tokens rate in Hz. Defaults to 75.
SAMPLE_RATE (int): sample rate. Defaults to 24_000.
USE_SMALLER_MODELS (bool): use smaller models. Defaults to False.
TEXT_ENCODING_OFFSET (int): text encoding offset. Defaults to 10_048.
SEMANTIC_PAD_TOKEN (int): semantic pad token. Defaults to 10_000.
TEXT_PAD_TOKEN ([type]): text pad token. Defaults to 10_048.
TEXT_EOS_TOKEN ([type]): text end of sentence token. Defaults to 10_049.
TEXT_SOS_TOKEN ([type]): text start of sentence token. Defaults to 10_050.
SEMANTIC_INFER_TOKEN (int): semantic infer token. Defaults to 10_051.
COARSE_SEMANTIC_PAD_TOKEN (int): coarse semantic pad token. Defaults to 12_048.
COARSE_INFER_TOKEN (int): coarse infer token. Defaults to 12_050.
REMOTE_BASE_URL ([type]): remote base url. Defaults to "https://huggingface.co/erogol/bark/tree".
REMOTE_MODEL_PATHS (Dict): remote model paths. Defaults to None.
LOCAL_MODEL_PATHS (Dict): local model paths. Defaults to None.
SMALL_REMOTE_MODEL_PATHS (Dict): small remote model paths. Defaults to None.
CACHE_DIR (str): local cache directory. Defaults to get_user_data_dir().
DEF_SPEAKER_DIR (str): default speaker directory to store speaker values for voice cloning. Defaults to get_user_data_dir().
"""
model: str = "bark"
audio: BarkAudioConfig = field(default_factory=BarkAudioConfig)
num_chars: int = 0
semantic_config: GPTConfig = field(default_factory=GPTConfig)
fine_config: FineGPTConfig = field(default_factory=FineGPTConfig)
coarse_config: GPTConfig = field(default_factory=GPTConfig)
CONTEXT_WINDOW_SIZE: int = 1024
SEMANTIC_RATE_HZ: float = 49.9
SEMANTIC_VOCAB_SIZE: int = 10_000
CODEBOOK_SIZE: int = 1024
N_COARSE_CODEBOOKS: int = 2
N_FINE_CODEBOOKS: int = 8
COARSE_RATE_HZ: int = 75
SAMPLE_RATE: int = 24_000
USE_SMALLER_MODELS: bool = False
TEXT_ENCODING_OFFSET: int = 10_048
SEMANTIC_PAD_TOKEN: int = 10_000
TEXT_PAD_TOKEN: int = 129_595
SEMANTIC_INFER_TOKEN: int = 129_599
COARSE_SEMANTIC_PAD_TOKEN: int = 12_048
COARSE_INFER_TOKEN: int = 12_050
REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/"
REMOTE_MODEL_PATHS: Dict = None
LOCAL_MODEL_PATHS: Dict = None
SMALL_REMOTE_MODEL_PATHS: Dict = None
CACHE_DIR: str = str(get_user_data_dir("tts/suno/bark_v0"))
DEF_SPEAKER_DIR: str = str(get_user_data_dir("tts/bark_v0/speakers"))
def __post_init__(self):
self.REMOTE_MODEL_PATHS = {
"text": {
"path": os.path.join(self.REMOTE_BASE_URL, "text_2.pt"),
"checksum": "54afa89d65e318d4f5f80e8e8799026a",
},
"coarse": {
"path": os.path.join(self.REMOTE_BASE_URL, "coarse_2.pt"),
"checksum": "8a98094e5e3a255a5c9c0ab7efe8fd28",
},
"fine": {
"path": os.path.join(self.REMOTE_BASE_URL, "fine_2.pt"),
"checksum": "59d184ed44e3650774a2f0503a48a97b",
},
}
self.LOCAL_MODEL_PATHS = {
"text": os.path.join(self.CACHE_DIR, "text_2.pt"),
"coarse": os.path.join(self.CACHE_DIR, "coarse_2.pt"),
"fine": os.path.join(self.CACHE_DIR, "fine_2.pt"),
"hubert_tokenizer": os.path.join(self.CACHE_DIR, "tokenizer.pth"),
"hubert": os.path.join(self.CACHE_DIR, "hubert.pt"),
}
self.SMALL_REMOTE_MODEL_PATHS = {
"text": {"path": os.path.join(self.REMOTE_BASE_URL, "text.pt")},
"coarse": {"path": os.path.join(self.REMOTE_BASE_URL, "coarse.pt")},
"fine": {"path": os.path.join(self.REMOTE_BASE_URL, "fine.pt")},
}
self.sample_rate = self.SAMPLE_RATE # pylint: disable=attribute-defined-outside-init
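A small sketch showing that the remote/local path tables are only filled in `__post_init__`, so they are available right after construction (the module path in the import is an assumption):
```
from TTS.tts.configs.bark_config import BarkConfig  # assumed module path for this new config

config = BarkConfig()
# derived in __post_init__ from REMOTE_BASE_URL and CACHE_DIR
print(config.REMOTE_MODEL_PATHS["text"]["path"])  # .../bark/tree/main/text_2.pt
print(config.LOCAL_MODEL_PATHS["coarse"])         # <user data dir>/tts/suno/bark_v0/coarse_2.pt
```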

View File

@ -0,0 +1,170 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig
@dataclass
class DelightfulTTSConfig(BaseTTSConfig):
"""
Configuration class for the DelightfulTTS model.
Attributes:
model (str): Name of the model ("delightful_tts").
audio (DelightfulTtsAudioConfig): Configuration for audio settings.
model_args (DelightfulTtsArgs): Configuration for model arguments.
use_attn_priors (bool): Whether to use attention priors.
vocoder (VocoderConfig): Configuration for the vocoder.
init_discriminator (bool): Whether to initialize the discriminator.
steps_to_start_discriminator (int): Number of steps to start the discriminator.
grad_clip (List[float]): Gradient clipping values.
lr_gen (float): Learning rate for the gan generator.
lr_disc (float): Learning rate for the gan discriminator.
lr_scheduler_gen (str): Name of the learning rate scheduler for the generator.
lr_scheduler_gen_params (dict): Parameters for the learning rate scheduler for the generator.
lr_scheduler_disc (str): Name of the learning rate scheduler for the discriminator.
lr_scheduler_disc_params (dict): Parameters for the learning rate scheduler for the discriminator.
scheduler_after_epoch (bool): Whether to schedule after each epoch.
optimizer (str): Name of the optimizer.
optimizer_params (dict): Parameters for the optimizer.
ssim_loss_alpha (float): Alpha value for the SSIM loss.
mel_loss_alpha (float): Alpha value for the mel loss.
aligner_loss_alpha (float): Alpha value for the aligner loss.
pitch_loss_alpha (float): Alpha value for the pitch loss.
energy_loss_alpha (float): Alpha value for the energy loss.
u_prosody_loss_alpha (float): Alpha value for the utterance prosody loss.
p_prosody_loss_alpha (float): Alpha value for the phoneme prosody loss.
dur_loss_alpha (float): Alpha value for the duration loss.
char_dur_loss_alpha (float): Alpha value for the character duration loss.
binary_align_loss_alpha (float): Alpha value for the binary alignment loss.
binary_loss_warmup_epochs (int): Number of warm-up epochs for the binary loss.
disc_loss_alpha (float): Alpha value for the discriminator loss.
gen_loss_alpha (float): Alpha value for the generator loss.
feat_loss_alpha (float): Alpha value for the feature loss.
vocoder_mel_loss_alpha (float): Alpha value for the vocoder mel loss.
multi_scale_stft_loss_alpha (float): Alpha value for the multi-scale STFT loss.
multi_scale_stft_loss_params (dict): Parameters for the multi-scale STFT loss.
return_wav (bool): Whether to return audio waveforms.
use_weighted_sampler (bool): Whether to use a weighted sampler.
weighted_sampler_attrs (dict): Attributes for the weighted sampler.
weighted_sampler_multipliers (dict): Multipliers for the weighted sampler.
r (int): Value for the `r` override.
compute_f0 (bool): Whether to compute F0 values.
f0_cache_path (str): Path to the F0 cache.
attn_prior_cache_path (str): Path to the attention prior cache.
num_speakers (int): Number of speakers.
use_speaker_embedding (bool): Whether to use speaker embedding.
speakers_file (str): Path to the speaker file.
speaker_embedding_channels (int): Number of channels for the speaker embedding.
language_ids_file (str): Path to the language IDs file.
"""
model: str = "delightful_tts"
# model specific params
audio: DelightfulTtsAudioConfig = field(default_factory=DelightfulTtsAudioConfig)
model_args: DelightfulTtsArgs = field(default_factory=DelightfulTtsArgs)
use_attn_priors: bool = True
# vocoder
vocoder: VocoderConfig = field(default_factory=VocoderConfig)
init_discriminator: bool = True
# optimizer
steps_to_start_discriminator: int = 200000
grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
lr_gen: float = 0.0002
lr_disc: float = 0.0002
lr_scheduler_gen: str = "ExponentialLR"
lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
lr_scheduler_disc: str = "ExponentialLR"
lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
scheduler_after_epoch: bool = True
optimizer: str = "AdamW"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})
# acoustic model loss params
ssim_loss_alpha: float = 1.0
mel_loss_alpha: float = 1.0
aligner_loss_alpha: float = 1.0
pitch_loss_alpha: float = 1.0
energy_loss_alpha: float = 1.0
u_prosody_loss_alpha: float = 0.5
p_prosody_loss_alpha: float = 0.5
dur_loss_alpha: float = 1.0
char_dur_loss_alpha: float = 0.01
binary_align_loss_alpha: float = 0.1
binary_loss_warmup_epochs: int = 10
# vocoder loss params
disc_loss_alpha: float = 1.0
gen_loss_alpha: float = 1.0
feat_loss_alpha: float = 1.0
vocoder_mel_loss_alpha: float = 10.0
multi_scale_stft_loss_alpha: float = 2.5
multi_scale_stft_loss_params: dict = field(
default_factory=lambda: {
"n_ffts": [1024, 2048, 512],
"hop_lengths": [120, 240, 50],
"win_lengths": [600, 1200, 240],
}
)
# data loader params
return_wav: bool = True
use_weighted_sampler: bool = False
weighted_sampler_attrs: dict = field(default_factory=lambda: {})
weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
# overrides
r: int = 1
# dataset configs
compute_f0: bool = True
f0_cache_path: str = None
attn_prior_cache_path: str = None
# multi-speaker settings
# use speaker embedding layer
num_speakers: int = 0
use_speaker_embedding: bool = False
speakers_file: str = None
speaker_embedding_channels: int = 256
language_ids_file: str = None
use_language_embedding: bool = False
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: str = None
d_vector_dim: int = None
# testing
test_sentences: List[List[str]] = field(
default_factory=lambda: [
["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
["Be a voice, not an echo."],
["I'm sorry Dave. I'm afraid I can't do that."],
["This cake is great. It's so delicious and moist."],
["Prior to November 22, 1963."],
]
)
def __post_init__(self):
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
if self.num_speakers > 0:
self.model_args.num_speakers = self.num_speakers
# speaker embedding settings
if self.use_speaker_embedding:
self.model_args.use_speaker_embedding = True
if self.speakers_file:
self.model_args.speakers_file = self.speakers_file
# d-vector settings
if self.use_d_vector_file:
self.model_args.use_d_vector_file = True
if self.d_vector_dim is not None and self.d_vector_dim > 0:
self.model_args.d_vector_dim = self.d_vector_dim
if self.d_vector_file:
self.model_args.d_vector_file = self.d_vector_file
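For reference, a hedged sketch of how the `__post_init__` above propagates the top-level multi-speaker settings into `model_args`; the import path is assumed to follow the `TTS.tts.configs` layout used elsewhere in this diff, and the speakers file name is a placeholder.

from TTS.tts.configs.delightful_tts_config import DelightfulTTSConfig  # assumed path

config = DelightfulTTSConfig(
    num_speakers=4,
    use_speaker_embedding=True,
    speakers_file="speakers.json",  # placeholder path
)
# __post_init__ mirrors the settings into the model args
assert config.model_args.num_speakers == 4
assert config.model_args.use_speaker_embedding is True
assert config.model_args.speakers_file == "speakers.json"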

View File

@ -100,13 +100,20 @@ class FastPitchConfig(BaseTTSConfig):
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
# dataset configs
compute_f0 (bool):
Compute pitch. Defaults to True.
f0_cache_path (str):
Path to the pitch cache. Defaults to None.
"""
model: str = "fast_pitch"
base_model: str = "forward_tts"
# model specific params
model_args: ForwardTTSArgs = ForwardTTSArgs()
model_args: ForwardTTSArgs = field(default_factory=ForwardTTSArgs)
# multi-speaker settings
num_speakers: int = 0

View File

@ -107,7 +107,7 @@ class FastSpeechConfig(BaseTTSConfig):
base_model: str = "forward_tts"
# model specific params
model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)
model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=False))
# multi-speaker settings
num_speakers: int = 0
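The `field(default_factory=...)` changes in this and the previous hunk follow the standard dataclass rule that mutable defaults must not be shared class-level objects; a small self-contained illustration of the pattern, with made-up class names:

from dataclasses import dataclass, field


@dataclass
class EncoderArgs:  # stand-in for ForwardTTSArgs
    dilations: list = field(default_factory=lambda: [1, 2, 4])


@dataclass
class ModelConfig:  # stand-in for FastSpeechConfig
    # each config instance gets a fresh EncoderArgs instead of one shared default object
    model_args: EncoderArgs = field(default_factory=EncoderArgs)


a, b = ModelConfig(), ModelConfig()
a.model_args.dilations.append(8)
assert b.model_args.dilations == [1, 2, 4]  # b is unaffected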

View File

@ -0,0 +1,198 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.forward_tts import ForwardTTSArgs
@dataclass
class Fastspeech2Config(BaseTTSConfig):
"""Configure `ForwardTTS` as a FastSpeech2 model.
Example:
>>> from TTS.tts.configs.fastspeech2_config import Fastspeech2Config
>>> config = Fastspeech2Config()
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `fastspeech2`.
base_model (str):
Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
model_args (Coqpit):
Model class arguments. Check `ForwardTTSArgs` for more details. Defaults to `ForwardTTSArgs()`.
data_dep_init_steps (int):
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
for the rest. Defaults to 10.
speakers_file (str):
Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
speaker names. Defaults to `None`.
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_d_vector_file (bool):
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
d_vector_dim (int):
Dimension of the external speaker embeddings. Defaults to 0.
optimizer (str):
Name of the model optimizer. Defaults to `Adam`.
optimizer_params (dict):
Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
lr_scheduler (str):
Name of the learning rate scheduler. Defaults to `NoamLR`.
lr_scheduler_params (dict):
Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
lr (float):
Initial learning rate. Defaults to `1e-4`.
grad_clip (float):
Gradient norm clipping value. Defaults to `5.0`.
spec_loss_type (str):
Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
duration_loss_type (str):
Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
use_ssim_loss (bool):
Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
wd (float):
Weight decay coefficient. Defaults to `1e-7`.
ssim_loss_alpha (float):
Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
dur_loss_alpha (float):
Weight for the duration predictor's loss. If set 0, disables the duration loss. Defaults to 0.1.
spec_loss_alpha (float):
Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
pitch_loss_alpha (float):
Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 0.1.
energy_loss_alpha (float):
Weight for the energy predictor's loss. If set 0, disables the energy predictor. Defaults to 0.1.
binary_align_loss_alpha (float):
Weight for the binary loss. If set 0, disables the binary loss. Defaults to 0.1.
binary_loss_warmup_epochs (float):
Number of epochs to gradually increase the binary loss impact. Defaults to 150.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
# dataset configs
compute_f0 (bool):
Compute pitch. Defaults to True.
f0_cache_path (str):
Path to the pitch cache. Defaults to None.
compute_energy (bool):
Compute energy. Defaults to True.
energy_cache_path (str):
Path to the energy cache. Defaults to None.
"""
model: str = "fastspeech2"
base_model: str = "forward_tts"
# model specific params
model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=True, use_energy=True))
# multi-speaker settings
num_speakers: int = 0
speakers_file: str = None
use_speaker_embedding: bool = False
use_d_vector_file: bool = False
d_vector_file: str = None
d_vector_dim: int = 0
# optimizer parameters
optimizer: str = "Adam"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
lr_scheduler: str = "NoamLR"
lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
lr: float = 1e-4
grad_clip: float = 5.0
# loss params
spec_loss_type: str = "mse"
duration_loss_type: str = "mse"
use_ssim_loss: bool = True
ssim_loss_alpha: float = 1.0
spec_loss_alpha: float = 1.0
aligner_loss_alpha: float = 1.0
pitch_loss_alpha: float = 0.1
energy_loss_alpha: float = 0.1
dur_loss_alpha: float = 0.1
binary_align_loss_alpha: float = 0.1
binary_loss_warmup_epochs: int = 150
# overrides
min_seq_len: int = 13
max_seq_len: int = 200
r: int = 1 # DO NOT CHANGE
# dataset configs
compute_f0: bool = True
f0_cache_path: str = None
# dataset configs
compute_energy: bool = True
energy_cache_path: str = None
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
"I'm sorry Dave. I'm afraid I can't do that.",
"This cake is great. It's so delicious and moist.",
"Prior to November 22, 1963.",
]
)
def __post_init__(self):
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
if self.num_speakers > 0:
self.model_args.num_speakers = self.num_speakers
# speaker embedding settings
if self.use_speaker_embedding:
self.model_args.use_speaker_embedding = True
if self.speakers_file:
self.model_args.speakers_file = self.speakers_file
# d-vector settings
if self.use_d_vector_file:
self.model_args.use_d_vector_file = True
if self.d_vector_dim is not None and self.d_vector_dim > 0:
self.model_args.d_vector_dim = self.d_vector_dim
if self.d_vector_file:
self.model_args.d_vector_file = self.d_vector_file
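A hedged usage sketch for the config above; the import mirrors the docstring example (note the class is spelled `Fastspeech2Config` here), and the cache paths are placeholders.

from TTS.tts.configs.fastspeech2_config import Fastspeech2Config  # as defined above

config = Fastspeech2Config(
    compute_f0=True,
    f0_cache_path="cache/f0",          # placeholder
    compute_energy=True,
    energy_cache_path="cache/energy",  # placeholder
)
# the base model is ForwardTTS with both pitch and energy predictors enabled
assert config.base_model == "forward_tts"
assert config.model_args.use_pitch and config.model_args.use_energy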

View File

@ -0,0 +1,170 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
@dataclass
class NeuralhmmTTSConfig(BaseTTSConfig):
"""
Define parameters for Neural HMM TTS model.
Example:
>>> from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig
>>> config = NeuralhmmTTSConfig()
Args:
model (str):
Model name used to select the right model class to initialize. Defaults to `NeuralHMM_TTS`.
run_eval_steps (int):
Run an evaluation epoch every N steps. If None, waits until the training epoch is completed. Defaults to 100.
save_step (int):
Save local checkpoint every save_step steps. Defaults to 500.
plot_step (int):
Plot training stats on the logger every plot_step steps. Defaults to 1.
model_param_stats (bool):
Log model parameters stats on the logger dashboard. Defaults to False.
force_generate_statistics (bool):
Force generate mel normalization statistics. Defaults to False.
mel_statistics_parameter_path (str):
Path to the mel normalization statistics. If the model doesn't find a file there, it will generate statistics.
Defaults to None.
num_chars (int):
Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
state_per_phone (int):
Generates N states per phone. Similar to the `add_blank` parameter in GlowTTS, but in Overflow it is upsampled by the model's encoder. Defaults to 2.
encoder_in_out_features (int):
Channels of encoder input and character embedding tensors. Defaults to 512.
encoder_n_convolutions (int):
Number of convolution layers in the encoder. Defaults to 3.
out_channels (int):
Channels of the final model output. It must match the spectrogram size. Defaults to 80.
ar_order (int):
Autoregressive order of the model. Defaults to 1. In the Neural HMM ablations, more autoregression gives more variation but hurts the naturalness of the synthesised audio.
sampling_temp (float):
Variation added to the sample from the latent space of the neural HMM. Defaults to 0.0.
deterministic_transition (bool):
Deterministic duration generation based on duration quantiles, as defined in "S. Ronanki, O. Watts, S. King, and G. E. Henter, “Median-based generation of synthetic speech durations using a non-parametric approach,” in Proc. SLT, 2016.". Defaults to True.
duration_threshold (float):
Threshold for the duration quantiles. Defaults to 0.43. Tune this to change the speaking rate of the synthesis, where lower values give a slower speaking rate and higher values give a faster one.
use_grad_checkpointing (bool):
Use gradient checkpointing to save memory. PyTorch currently does not support gradient checkpointing inside a loop in a multi-GPU setting, so it has to be turned off there. Adjust it depending on whether you get a larger batch size with a single GPU or with multiple GPUs. Defaults to True.
max_sampling_time (int):
Maximum sampling time while synthesising latents from neural HMM. Defaults to 1000.
prenet_type (str):
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
Prenet. Defaults to `original`.
prenet_dim (int):
Dimension of the Prenet. Defaults to 256.
prenet_n_layers (int):
Number of layers in the Prenet. Defaults to 2.
prenet_dropout (float):
Dropout rate of the Prenet. Defaults to 0.5.
prenet_dropout_at_inference (bool):
Use dropout at inference time. Defaults to True.
memory_rnn_dim (int):
Dimension of the memory LSTM to process the prenet output. Defaults to 1024.
outputnet_size (list[int]):
Size of the output network inside the neural HMM. Defaults to [1024].
flat_start_params (dict):
Parameters for the flat start initialization of the neural HMM. Defaults to `{"mean": 0.0, "std": 1.0, "transition_p": 0.14}`.
It will be recomputed when you pass the dataset.
std_floor (float):
Floor value for the standard deviation of the neural HMM. Prevents the model from cheating by putting point mass and getting infinite likelihood at any datapoint. Defaults to 0.001.
It is called `variance flooring` in standard HMM literature.
optimizer (str):
Optimizer to use for training. Defaults to `adam`.
optimizer_params (dict):
Parameters for the optimizer. Defaults to `{"weight_decay": 1e-6}`.
grad_clip (float):
Gradient clipping threshold. Defaults to 40_000.
lr (float):
Learning rate. Defaults to 1e-3.
lr_scheduler (str):
Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
`TTS.utils.training`. Defaults to `None`.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
"""
model: str = "NeuralHMM_TTS"
# Training and Checkpoint configs
run_eval_steps: int = 100
save_step: int = 500
plot_step: int = 1
model_param_stats: bool = False
# data parameters
force_generate_statistics: bool = False
mel_statistics_parameter_path: str = None
# Encoder parameters
num_chars: int = None
state_per_phone: int = 2
encoder_in_out_features: int = 512
encoder_n_convolutions: int = 3
# HMM parameters
out_channels: int = 80
ar_order: int = 1
sampling_temp: float = 0
deterministic_transition: bool = True
duration_threshold: float = 0.43
use_grad_checkpointing: bool = True
max_sampling_time: int = 1000
## Prenet parameters
prenet_type: str = "original"
prenet_dim: int = 256
prenet_n_layers: int = 2
prenet_dropout: float = 0.5
prenet_dropout_at_inference: bool = True
memory_rnn_dim: int = 1024
## Outputnet parameters
outputnet_size: List[int] = field(default_factory=lambda: [1024])
flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14})
std_floor: float = 0.001
# optimizer parameters
optimizer: str = "Adam"
optimizer_params: dict = field(default_factory=lambda: {"weight_decay": 1e-6})
grad_clip: float = 40000.0
lr: float = 1e-3
lr_scheduler: str = None
# overrides
min_text_len: int = 10
max_text_len: int = 500
min_audio_len: int = 512
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"Be a voice, not an echo.",
]
)
# Extra needed config
r: int = 1
use_d_vector_file: bool = False
use_speaker_embedding: bool = False
def check_values(self):
"""Validate the hyperparameters.
Raises:
AssertionError: when the parameters network is not defined
AssertionError: transition probability is not between 0 and 1
"""
assert self.ar_order > 0, "AR order must be greater than 0; this is an autoregressive model."
assert (
len(self.outputnet_size) >= 1
), f"Parameter network must have at least one layer. Check the config file. Provided: {self.outputnet_size}"
assert (
0 < self.flat_start_params["transition_p"] < 1
), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"
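`check_values` is a plain assertion helper; a hedged sketch of calling it (import path assumed to be `TTS.tts.configs.neuralhmm_tts_config`, matching the corrected docstring example above):

from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig  # assumed path

config = NeuralhmmTTSConfig(num_chars=120)
config.check_values()  # passes with the defaults above

bad = NeuralhmmTTSConfig(num_chars=120, flat_start_params={"mean": 0.0, "std": 1.0, "transition_p": 1.5})
try:
    bad.check_values()
except AssertionError as err:
    print(err)  # Transition probability must be between 0 and 1. Provided: 1.5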

View File

@ -0,0 +1,201 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
@dataclass
class OverflowConfig(BaseTTSConfig): # The classname has to be camel case
"""
Define parameters for OverFlow model.
Example:
>>> from TTS.tts.configs.overflow_config import OverflowConfig
>>> config = OverflowConfig()
Args:
model (str):
Model name used to select the right model class to initialize. Defaults to `Overflow`.
run_eval_steps (int):
Run an evaluation epoch every N steps. If None, waits until the training epoch is completed. Defaults to 100.
save_step (int):
Save local checkpoint every save_step steps. Defaults to 500.
plot_step (int):
Plot training stats on the logger every plot_step steps. Defaults to 1.
model_param_stats (bool):
Log model parameters stats on the logger dashboard. Defaults to False.
force_generate_statistics (bool):
Force generate mel normalization statistics. Defaults to False.
mel_statistics_parameter_path (str):
Path to the mel normalization statistics. If the model doesn't find a file there, it will generate statistics.
Defaults to None.
num_chars (int):
Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
state_per_phone (int):
Generates N states per phone. Similar to the `add_blank` parameter in GlowTTS, but in Overflow it is upsampled by the model's encoder. Defaults to 2.
encoder_in_out_features (int):
Channels of encoder input and character embedding tensors. Defaults to 512.
encoder_n_convolutions (int):
Number of convolution layers in the encoder. Defaults to 3.
out_channels (int):
Channels of the final model output. It must match the spectrogram size. Defaults to 80.
ar_order (int):
Autoregressive order of the model. Defaults to 1. In the Neural HMM ablations, more autoregression gives more variation but hurts the naturalness of the synthesised audio.
sampling_temp (float):
Variation added to the sample from the latent space of neural HMM. Defaults to 0.334.
deterministic_transition (bool):
Deterministic duration generation based on duration quantiles, as defined in "S. Ronanki, O. Watts, S. King, and G. E. Henter, “Median-based generation of synthetic speech durations using a non-parametric approach,” in Proc. SLT, 2016.". Defaults to True.
duration_threshold (float):
Threshold for the duration quantiles. Defaults to 0.55. Tune this to change the speaking rate of the synthesis, where lower values give a slower speaking rate and higher values give a faster one.
use_grad_checkpointing (bool):
Use gradient checkpointing to save memory. PyTorch currently does not support gradient checkpointing inside a loop in a multi-GPU setting, so it has to be turned off there. Adjust it depending on whether you get a larger batch size with a single GPU or with multiple GPUs. Defaults to True.
max_sampling_time (int):
Maximum sampling time while synthesising latents from neural HMM. Defaults to 1000.
prenet_type (str):
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
Prenet. Defaults to `original`.
prenet_dim (int):
Dimension of the Prenet. Defaults to 256.
prenet_n_layers (int):
Number of layers in the Prenet. Defaults to 2.
prenet_dropout (float):
Dropout rate of the Prenet. Defaults to 0.5.
prenet_dropout_at_inference (bool):
Use dropout at inference time. Defaults to False.
memory_rnn_dim (int):
Dimension of the memory LSTM to process the prenet output. Defaults to 1024.
outputnet_size (list[int]):
Size of the output network inside the neural HMM. Defaults to [1024].
flat_start_params (dict):
Parameters for the flat start initialization of the neural HMM. Defaults to `{"mean": 0.0, "std": 1.0, "transition_p": 0.14}`.
It will be recomputed when you pass the dataset.
std_floor (float):
Floor value for the standard deviation of the neural HMM. Prevents model cheating by putting point mass and getting infinite likelihood at any datapoint. Defaults to 0.01.
It is called `variance flooring` in standard HMM literature.
hidden_channels_dec (int):
Number of base hidden channels used by the decoder WaveNet network. Defaults to 150.
kernel_size_dec (int):
Decoder kernel size. Defaults to 5
dilation_rate (int):
Rate to increase dilation by each layer in a decoder block. Defaults to 1.
num_flow_blocks_dec (int):
Number of decoder blocks. Defaults to 12.
num_block_layers (int):
Number of decoder layers in each decoder block. Defaults to 4.
dropout_p_dec (float):
Dropout rate of the decoder. Defaults to 0.05.
num_splits (int):
Number of split levels in the invertible conv1x1 operation. Defaults to 4.
num_squeeze (int):
Number of squeeze levels. When squeezing, the number of channels increases and the number of time steps is reduced by a factor of
`num_squeeze`. Defaults to 2.
sigmoid_scale (bool):
enable/disable sigmoid scaling in decoder. Defaults to False.
c_in_channels (int):
Unused parameter from GlowTTS's decoder. Defaults to 0.
optimizer (str):
Optimizer to use for training. Defaults to `adam`.
optimizer_params (dict):
Parameters for the optimizer. Defaults to `{"weight_decay": 1e-6}`.
grad_clip (float):
Gradient clipping threshold. Defaults to 40_000.
lr (float):
Learning rate. Defaults to 1e-3.
lr_scheduler (str):
Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
`TTS.utils.training`. Defaults to `None`.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
"""
model: str = "Overflow"
# Training and Checkpoint configs
run_eval_steps: int = 100
save_step: int = 500
plot_step: int = 1
model_param_stats: bool = False
# data parameters
force_generate_statistics: bool = False
mel_statistics_parameter_path: str = None
# Encoder parameters
num_chars: int = None
state_per_phone: int = 2
encoder_in_out_features: int = 512
encoder_n_convolutions: int = 3
# HMM parameters
out_channels: int = 80
ar_order: int = 1
sampling_temp: float = 0.334
deterministic_transition: bool = True
duration_threshold: float = 0.55
use_grad_checkpointing: bool = True
max_sampling_time: int = 1000
## Prenet parameters
prenet_type: str = "original"
prenet_dim: int = 256
prenet_n_layers: int = 2
prenet_dropout: float = 0.5
prenet_dropout_at_inference: bool = False
memory_rnn_dim: int = 1024
## Outputnet parameters
outputnet_size: List[int] = field(default_factory=lambda: [1024])
flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14})
std_floor: float = 0.01
# Decoder parameters
hidden_channels_dec: int = 150
kernel_size_dec: int = 5
dilation_rate: int = 1
num_flow_blocks_dec: int = 12
num_block_layers: int = 4
dropout_p_dec: float = 0.05
num_splits: int = 4
num_squeeze: int = 2
sigmoid_scale: bool = False
c_in_channels: int = 0
# optimizer parameters
optimizer: str = "Adam"
optimizer_params: dict = field(default_factory=lambda: {"weight_decay": 1e-6})
grad_clip: float = 40000.0
lr: float = 1e-3
lr_scheduler: str = None
# overrides
min_text_len: int = 10
max_text_len: int = 500
min_audio_len: int = 512
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"Be a voice, not an echo.",
]
)
# Extra needed config
r: int = 1
use_d_vector_file: bool = False
use_speaker_embedding: bool = False
def check_values(self):
"""Validate the hyperparameters.
Raises:
AssertionError: when the parameters network is not defined
AssertionError: transition probability is not between 0 and 1
"""
assert self.ar_order > 0, "AR order must be greater than 0; this is an autoregressive model."
assert (
len(self.outputnet_size) >= 1
), f"Parameter network must have at least one layer. Check the config file. Provided: {self.outputnet_size}"
assert (
0 < self.flat_start_params["transition_p"] < 1
), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"
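Per the docstring above, lower `duration_threshold` values slow the speech down and higher values speed it up, while `sampling_temp` controls the variation of the sampled latents; a hedged sketch of overriding them (import path taken from the docstring example):

from TTS.tts.configs.overflow_config import OverflowConfig

slow = OverflowConfig(duration_threshold=0.40, sampling_temp=0.2)  # slower, less varied speech
fast = OverflowConfig(duration_threshold=0.70)                     # faster speech, default variation
print(slow.duration_threshold, slow.sampling_temp)
print(fast.duration_threshold, fast.sampling_temp)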

View File

@ -217,6 +217,9 @@ class BaseTTSConfig(BaseTrainingConfig):
compute_f0 (bool):
(Not in use yet).
compute_energy (bool):
(Not in use yet).
compute_linear_spec (bool):
If True data loader computes and returns linear spectrograms alongside the other data.
@ -230,6 +233,13 @@ class BaseTTSConfig(BaseTrainingConfig):
If True, the data loader will start loading the longest batch first. It is useful for checking OOM issues.
Defaults to False.
shuffle (bool):
If True, the data loader will shuffle the dataset when there is no sampler defined. Defaults to False.
drop_last (bool):
If True, the data loader will drop the last batch if it is not complete. It helps to prevent
issues caused by partial-batch statistics. Defaults to False.
add_blank (bool):
Add a blank character between every two characters. It improves performance for some models at the expense
of slower run-time due to the longer input sequence.
@ -305,17 +315,20 @@ class BaseTTSConfig(BaseTrainingConfig):
min_text_len: int = 1
max_text_len: int = float("inf")
compute_f0: bool = False
compute_energy: bool = False
compute_linear_spec: bool = False
precompute_num_workers: int = 0
use_noise_augment: bool = False
start_by_longest: bool = False
shuffle: bool = False
drop_last: bool = False
# dataset
datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
# optimizer
optimizer: str = "radam"
optimizer_params: dict = None
# scheduler
lr_scheduler: str = ""
lr_scheduler: str = None
lr_scheduler_params: dict = field(default_factory=lambda: {})
# testing
test_sentences: List[str] = field(default_factory=lambda: [])
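The new `compute_energy`, `shuffle`, and `drop_last` fields are plain data-loader switches on `BaseTTSConfig`; a hedged sketch of setting them on a concrete subclass (`VitsConfig` is used only as an example of a `BaseTTSConfig` subclass):

from TTS.tts.configs.vits_config import VitsConfig  # any BaseTTSConfig subclass works here

config = VitsConfig(
    shuffle=True,          # shuffle samples when no custom sampler is defined
    drop_last=True,        # drop the final partial batch to keep batch statistics stable
    compute_energy=False,  # energy extraction stays off unless the model needs it
)
print(config.shuffle, config.drop_last, config.compute_energy)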

View File

@ -103,26 +103,28 @@ class SpeedySpeechConfig(BaseTTSConfig):
base_model: str = "forward_tts"
# set model args as SpeedySpeech
model_args: ForwardTTSArgs = ForwardTTSArgs(
use_pitch=False,
encoder_type="residual_conv_bn",
encoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 13,
},
decoder_type="residual_conv_bn",
decoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4, 8] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 17,
},
out_channels=80,
hidden_channels=128,
positional_encoding=True,
detach_duration_predictor=True,
model_args: ForwardTTSArgs = field(
default_factory=lambda: ForwardTTSArgs(
use_pitch=False,
encoder_type="residual_conv_bn",
encoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 13,
},
decoder_type="residual_conv_bn",
decoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4, 8] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 17,
},
out_channels=80,
hidden_channels=128,
positional_encoding=True,
detach_duration_predictor=True,
)
)
# multi-speaker settings

View File

@ -0,0 +1,87 @@
from dataclasses import dataclass, field
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.tortoise import TortoiseArgs, TortoiseAudioConfig
@dataclass
class TortoiseConfig(BaseTTSConfig):
"""Defines parameters for Tortoise TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (TortoiseArgs):
Model architecture arguments. Defaults to `TortoiseArgs()`.
audio (TortoiseAudioConfig):
Audio processing configuration. Defaults to `TortoiseAudioConfig()`.
model_dir (str):
Path to the folder that has all the Tortoise models. Defaults to None.
temperature (float):
Temperature for the autoregressive model inference. Larger values make predictions more creative at the expense of stability. Defaults to `0.2`.
length_penalty (float):
Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length,
which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative),
length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.
repetition_penalty (float):
The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.
top_p (float):
If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
Defaults to `0.8`.
cond_free_k (float):
Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
Formula is: output=cond_present_output*(cond_free_k+1)-cond_absent_output*cond_free_k. Defaults to `2.0`.
diffusion_temperature (float):
Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
are the "mean" prediction of the diffusion network and will sound bland and smeared.
Defaults to `1.0`.
num_autoregressive_samples (int):
Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
As Tortoise is a probabilistic model, more samples means a higher probability of creating something "great".
Defaults to `16`.
diffusion_iterations (int):
Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
however. Defaults to `30`.
sampler (str):
Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.tts.configs.tortoise_config import TortoiseConfig
>>> config = TortoiseConfig()
"""
model: str = "tortoise"
# model specific params
model_args: TortoiseArgs = field(default_factory=TortoiseArgs)
audio: TortoiseAudioConfig = field(default_factory=TortoiseAudioConfig)
model_dir: str = None
# settings
temperature: float = 0.2
length_penalty: float = 1.0
repetition_penalty: float = 2.0
top_p: float = 0.8
cond_free_k: float = 2.0
diffusion_temperature: float = 1.0
# inference params
num_autoregressive_samples: int = 16
diffusion_iterations: int = 30
sampler: str = "ddim"
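A hedged sketch of adjusting the inference knobs documented above (import path from the docstring example; the values are illustrative, not recommended settings):

from TTS.tts.configs.tortoise_config import TortoiseConfig

config = TortoiseConfig(
    temperature=0.4,                # more creative, less stable autoregressive sampling
    num_autoregressive_samples=32,  # more candidates for CLVP to filter
    diffusion_iterations=100,       # more refinement steps in the diffusion decoder
    sampler="dpm++2m",              # the alternative sampler named in the docstring
)
print(config.temperature, config.sampler)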

View File

@ -109,7 +109,7 @@ class VitsConfig(BaseTTSConfig):
model: str = "vits"
# model specific params
model_args: VitsArgs = field(default_factory=VitsArgs)
audio: VitsAudioConfig = VitsAudioConfig()
audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)
# optimizer
grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
@ -167,7 +167,7 @@ class VitsConfig(BaseTTSConfig):
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: str = None
d_vector_file: List[str] = None
d_vector_dim: int = None
def __post_init__(self):
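The `d_vector_file` hunk above widens the field from a single path to a list of paths; a hedged sketch of passing several d-vector files through the model args (`VitsArgs` import path assumed, file names are placeholders):

from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models.vits import VitsArgs  # assumed import path

config = VitsConfig(
    model_args=VitsArgs(
        use_d_vector_file=True,
        d_vector_file=["speakers_set1.pth", "speakers_set2.pth"],  # now a list of files
        d_vector_dim=512,
    )
)
print(config.model_args.d_vector_file)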

View File

@ -0,0 +1,107 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig
@dataclass
class XttsConfig(BaseTTSConfig):
"""Defines parameters for XTTS TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (XttsArgs):
Model architecture arguments. Defaults to `XttsArgs()`.
audio (XttsAudioConfig):
Audio processing configuration. Defaults to `XttsAudioConfig()`.
model_dir (str):
Path to the folder that has all the XTTS models. Defaults to None.
temperature (float):
Temperature for the autoregressive model inference. Larger values make predictions more creative at the expense of stability. Defaults to `0.85`.
length_penalty (float):
Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length,
which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative),
length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.
repetition_penalty (float):
The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.
top_p (float):
If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
Defaults to `0.85`.
num_gpt_outputs (int):
Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
As XTTS is a probabilistic model, more samples means a higher probability of creating something "great".
Defaults to `1`.
gpt_cond_len (int):
Seconds of audio to be used as conditioning for the autoregressive model. Defaults to `12`.
gpt_cond_chunk_len (int):
Audio chunk size in secs. Audio is split into chunks and latents are extracted for each chunk. Then the
latents are averaged. Chunking improves the stability. It must be <= gpt_cond_len.
If gpt_cond_len == gpt_cond_chunk_len, no chunking. Defaults to `4`.
max_ref_len (int):
Maximum number of seconds of audio to be used as conditioning for the decoder. Defaults to `10`.
sound_norm_refs (bool):
Whether to normalize the conditioning audio. Defaults to `False`.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.tts.configs.xtts_config import XttsConfig
>>> config = XttsConfig()
"""
model: str = "xtts"
# model specific params
model_args: XttsArgs = field(default_factory=XttsArgs)
audio: XttsAudioConfig = field(default_factory=XttsAudioConfig)
model_dir: str = None
languages: List[str] = field(
default_factory=lambda: [
"en",
"es",
"fr",
"de",
"it",
"pt",
"pl",
"tr",
"ru",
"nl",
"cs",
"ar",
"zh-cn",
"hu",
"ko",
"ja",
"hi",
]
)
# inference params
temperature: float = 0.85
length_penalty: float = 1.0
repetition_penalty: float = 2.0
top_k: int = 50
top_p: float = 0.85
num_gpt_outputs: int = 1
# cloning
gpt_cond_len: int = 12
gpt_cond_chunk_len: int = 4
max_ref_len: int = 10
sound_norm_refs: bool = False
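A hedged sketch of the voice-cloning parameters described above; note the constraint that `gpt_cond_chunk_len` must not exceed `gpt_cond_len` (import path from the docstring example):

from TTS.tts.configs.xtts_config import XttsConfig

config = XttsConfig(
    gpt_cond_len=12,       # seconds of reference audio used for GPT conditioning
    gpt_cond_chunk_len=4,  # latents computed per 4 s chunk and averaged; must be <= gpt_cond_len
    max_ref_len=10,        # cap on reference audio seen by the decoder
    sound_norm_refs=False,
)
assert config.gpt_cond_chunk_len <= config.gpt_cond_len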

View File

@ -1,3 +1,4 @@
import os
import sys
from collections import Counter
from pathlib import Path
@ -12,20 +13,16 @@ from TTS.tts.datasets.formatters import *
def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
"""Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.
Args:
items (List[List]):
A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`.
eval_split_max_size (int):
Maximum number of samples to be used for evaluation in proportion split. Defaults to None (disabled).
eval_split_size (float):
If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
"""
speakers = [item["speaker_name"] for item in items]
is_multi_speaker = len(set(speakers)) > 1
@ -59,6 +56,17 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
return items[:eval_split_size], items[eval_split_size:]
def add_extra_keys(metadata, language, dataset_name):
for item in metadata:
# add language name
item["language"] = language
# add unique audio name
relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0]
audio_unique_name = f"{dataset_name}#{relfilepath}"
item["audio_unique_name"] = audio_unique_name
return metadata
def load_tts_samples(
datasets: Union[List[Dict], Dict],
eval_split=True,
@ -97,7 +105,8 @@ def load_tts_samples(
if not isinstance(datasets, list):
datasets = [datasets]
for dataset in datasets:
name = dataset["name"]
formatter_name = dataset["formatter"]
dataset_name = dataset["dataset_name"]
root_path = dataset["path"]
meta_file_train = dataset["meta_file_train"]
meta_file_val = dataset["meta_file_val"]
@ -106,19 +115,22 @@ def load_tts_samples(
# setup the right data processor
if formatter is None:
formatter = _get_formatter_by_name(name)
formatter = _get_formatter_by_name(formatter_name)
# load train set
meta_data_train = formatter(root_path, meta_file_train, ignored_speakers=ignored_speakers)
meta_data_train = [{**item, **{"language": language}} for item in meta_data_train]
assert len(meta_data_train) > 0, f" [!] No training samples found in {root_path}/{meta_file_train}"
meta_data_train = add_extra_keys(meta_data_train, language, dataset_name)
print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
# load evaluation split if set
if eval_split:
if meta_file_val:
meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
meta_data_eval = [{**item, **{"language": language}} for item in meta_data_eval]
meta_data_eval = add_extra_keys(meta_data_eval, language, dataset_name)
else:
meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_split_max_size, eval_split_size)
eval_size_per_dataset = eval_split_max_size // len(datasets) if eval_split_max_size else None
meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_size_per_dataset, eval_split_size)
meta_data_eval_all += meta_data_eval
meta_data_train_all += meta_data_train
# load attention masks for the duration predictor training
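For clarity, `add_extra_keys` derives the `audio_unique_name` key that later hunks use for cache file names and d-vector lookups; a small self-contained illustration of the naming scheme (paths are made up):

import os

item = {"audio_file": "/data/ljspeech/wavs/LJ001-0001.wav", "root_path": "/data/ljspeech"}
relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0]
audio_unique_name = f"ljspeech#{relfilepath}"
print(audio_unique_name)  # ljspeech#wavs/LJ001-0001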

View File

@ -1,3 +1,4 @@
import base64
import collections
import os
import random
@ -10,6 +11,9 @@ from torch.utils.data import Dataset
from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
from TTS.utils.audio import AudioProcessor
from TTS.utils.audio.numpy_transforms import compute_energy as calculate_energy
import mutagen
# to prevent too many open files error as suggested here
# https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936
@ -34,6 +38,21 @@ def noise_augment_audio(wav):
return wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)
def string2filename(string):
# generate a safe and reversible filename based on a string
filename = base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore")
return filename
def get_audio_size(audiopath):
extension = audiopath.rpartition(".")[-1].lower()
if extension not in {"mp3", "wav", "flac"}:
raise RuntimeError(f"The audio format {extension} is not supported, please convert the audio files to mp3, flac, or wav format!")
audio_info = mutagen.File(audiopath).info
return int(audio_info.length * audio_info.sample_rate)
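The two helpers above replace the old size-based length estimate (`os.path.getsize(...) / 16 * 8`) with metadata read via mutagen and give cache entries reversible names; a hedged usage sketch, assuming both are importable from `TTS.tts.datasets.dataset` and the wav path is a placeholder:

import base64

from TTS.tts.datasets.dataset import get_audio_size, string2filename  # as defined above

n_samples = get_audio_size("clips/sample_0001.wav")  # mp3 and flac are accepted as well
print(n_samples)                                     # length in samples from the audio metadata

key = string2filename("ljspeech#wavs/LJ001-0001")
print(key)                                            # filesystem-safe cache name
print(base64.urlsafe_b64decode(key).decode("utf-8"))  # round-trips to the original key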
class TTSDataset(Dataset):
def __init__(
self,
@ -43,7 +62,9 @@ class TTSDataset(Dataset):
samples: List[Dict] = None,
tokenizer: "TTSTokenizer" = None,
compute_f0: bool = False,
compute_energy: bool = False,
f0_cache_path: str = None,
energy_cache_path: str = None,
return_wav: bool = False,
batch_group_size: int = 0,
min_text_len: int = 0,
@ -77,8 +98,12 @@ class TTSDataset(Dataset):
compute_f0 (bool): compute f0 if True. Defaults to False.
compute_energy (bool): compute energy if True. Defaults to False.
f0_cache_path (str): Path to store f0 cache. Defaults to None.
energy_cache_path (str): Path to store energy cache. Defaults to None.
return_wav (bool): Return the waveform of the sample. Defaults to False.
batch_group_size (int): Range of batch randomization after sorting
@ -121,7 +146,9 @@ class TTSDataset(Dataset):
self.compute_linear_spec = compute_linear_spec
self.return_wav = return_wav
self.compute_f0 = compute_f0
self.compute_energy = compute_energy
self.f0_cache_path = f0_cache_path
self.energy_cache_path = energy_cache_path
self.min_audio_len = min_audio_len
self.max_audio_len = max_audio_len
self.min_text_len = min_text_len
@ -148,7 +175,10 @@ class TTSDataset(Dataset):
self.f0_dataset = F0Dataset(
self.samples, self.ap, cache_path=f0_cache_path, precompute_num_workers=precompute_num_workers
)
if compute_energy:
self.energy_dataset = EnergyDataset(
self.samples, self.ap, cache_path=energy_cache_path, precompute_num_workers=precompute_num_workers
)
if self.verbose:
self.print_logs()
@ -157,7 +187,7 @@ class TTSDataset(Dataset):
lens = []
for item in self.samples:
_, wav_file, *_ = _parse_sample(item)
audio_len = os.path.getsize(wav_file) / 16 * 8 # assuming 16bit audio
audio_len = get_audio_size(wav_file)
lens.append(audio_len)
return lens
@ -170,6 +200,8 @@ class TTSDataset(Dataset):
self._samples = new_samples
if hasattr(self, "f0_dataset"):
self.f0_dataset.samples = new_samples
if hasattr(self, "energy_dataset"):
self.energy_dataset.samples = new_samples
if hasattr(self, "phoneme_dataset"):
self.phoneme_dataset.samples = new_samples
@ -201,7 +233,13 @@ class TTSDataset(Dataset):
def get_f0(self, idx):
out_dict = self.f0_dataset[idx]
item = self.samples[idx]
assert item["audio_file"] == out_dict["audio_file"]
assert item["audio_unique_name"] == out_dict["audio_unique_name"]
return out_dict
def get_energy(self, idx):
out_dict = self.energy_dataset[idx]
item = self.samples[idx]
assert item["audio_unique_name"] == out_dict["audio_unique_name"]
return out_dict
@staticmethod
@ -245,17 +283,22 @@ class TTSDataset(Dataset):
f0 = None
if self.compute_f0:
f0 = self.get_f0(idx)["f0"]
energy = None
if self.compute_energy:
energy = self.get_energy(idx)["energy"]
sample = {
"raw_text": raw_text,
"token_ids": token_ids,
"wav": wav,
"pitch": f0,
"energy": energy,
"attn": attn,
"item_idx": item["audio_file"],
"speaker_name": item["speaker_name"],
"language_name": item["language"],
"wav_file_name": os.path.basename(item["audio_file"]),
"audio_unique_name": item["audio_unique_name"],
}
return sample
@ -263,7 +306,7 @@ class TTSDataset(Dataset):
def _compute_lengths(samples):
new_samples = []
for item in samples:
audio_length = os.path.getsize(item["audio_file"]) / 16 * 8 # assuming 16bit audio
audio_length = get_audio_size(item["audio_file"])
text_lenght = len(item["text"])
item["audio_length"] = audio_length
item["text_length"] = text_lenght
@ -381,7 +424,6 @@ class TTSDataset(Dataset):
# Puts each data field into a tensor with outer dimension batch size
if isinstance(batch[0], collections.abc.Mapping):
token_ids_lengths = np.array([len(d["token_ids"]) for d in batch])
# sort items with text input length for RNN efficiency
@ -397,8 +439,8 @@ class TTSDataset(Dataset):
language_ids = None
# get pre-computed d-vectors
if self.d_vector_mapping is not None:
wav_files_names = list(batch["wav_file_name"])
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names]
embedding_keys = list(batch["audio_unique_name"])
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in embedding_keys]
else:
d_vectors = None
@ -482,7 +524,13 @@ class TTSDataset(Dataset):
pitch = torch.FloatTensor(pitch)[:, None, :].contiguous() # B x 1 xT
else:
pitch = None
# format energy
if self.compute_energy:
energy = prepare_data(batch["energy"])
assert mel.shape[1] == energy.shape[1], f"[!] {mel.shape} vs {energy.shape}"
energy = torch.FloatTensor(energy)[:, None, :].contiguous() # B x 1 xT
else:
energy = None
# format attention masks
attns = None
if batch["attn"][0] is not None:
@ -511,7 +559,9 @@ class TTSDataset(Dataset):
"waveform": wav_padded,
"raw_text": batch["raw_text"],
"pitch": pitch,
"energy": energy,
"language_ids": language_ids,
"audio_unique_names": batch["audio_unique_name"],
}
raise TypeError(
@ -560,25 +610,24 @@ class PhonemeDataset(Dataset):
def __getitem__(self, index):
item = self.samples[index]
ids = self.compute_or_load(item["audio_file"], item["text"])
ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"], item["language"])
ph_hat = self.tokenizer.ids_to_text(ids)
return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)}
def __len__(self):
return len(self.samples)
def compute_or_load(self, wav_file, text):
def compute_or_load(self, file_name, text, language):
"""Compute phonemes for the given text.
If the phonemes are already cached, load them from cache.
"""
file_name = os.path.splitext(os.path.basename(wav_file))[0]
file_ext = "_phoneme.npy"
cache_path = os.path.join(self.cache_path, file_name + file_ext)
try:
ids = np.load(cache_path)
except FileNotFoundError:
ids = self.tokenizer.text_to_ids(text)
ids = self.tokenizer.text_to_ids(text, language=language)
np.save(cache_path, ids)
return ids
@ -648,6 +697,7 @@ class F0Dataset:
self,
samples: Union[List[List], List[Dict]],
ap: "AudioProcessor",
audio_config=None, # pylint: disable=unused-argument
verbose=False,
cache_path: str = None,
precompute_num_workers=0,
@ -669,11 +719,11 @@ class F0Dataset:
def __getitem__(self, idx):
item = self.samples[idx]
f0 = self.compute_or_load(item["audio_file"])
f0 = self.compute_or_load(item["audio_file"], string2filename(item["audio_unique_name"]))
if self.normalize_f0:
assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available"
f0 = self.normalize(f0)
return {"audio_file": item["audio_file"], "f0": f0}
return {"audio_unique_name": item["audio_unique_name"], "f0": f0}
def __len__(self):
return len(self.samples)
@ -705,8 +755,7 @@ class F0Dataset:
return self.pad_id
@staticmethod
def create_pitch_file_path(wav_file, cache_path):
file_name = os.path.splitext(os.path.basename(wav_file))[0]
def create_pitch_file_path(file_name, cache_path):
pitch_file = os.path.join(cache_path, file_name + "_pitch.npy")
return pitch_file
@ -744,11 +793,11 @@ class F0Dataset:
pitch[zero_idxs] = 0.0
return pitch
def compute_or_load(self, wav_file):
def compute_or_load(self, wav_file, audio_unique_name):
"""
compute pitch and return a numpy array of pitch values
"""
pitch_file = self.create_pitch_file_path(wav_file, self.cache_path)
pitch_file = self.create_pitch_file_path(audio_unique_name, self.cache_path)
if not os.path.exists(pitch_file):
pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file)
else:
@ -756,17 +805,169 @@ class F0Dataset:
return pitch.astype(np.float32)
def collate_fn(self, batch):
audio_file = [item["audio_file"] for item in batch]
audio_unique_name = [item["audio_unique_name"] for item in batch]
f0s = [item["f0"] for item in batch]
f0_lens = [len(item["f0"]) for item in batch]
f0_lens_max = max(f0_lens)
f0s_torch = torch.LongTensor(len(f0s), f0_lens_max).fill_(self.get_pad_id())
for i, f0_len in enumerate(f0_lens):
f0s_torch[i, :f0_len] = torch.LongTensor(f0s[i])
return {"audio_file": audio_file, "f0": f0s_torch, "f0_lens": f0_lens}
return {"audio_unique_name": audio_unique_name, "f0": f0s_torch, "f0_lens": f0_lens}
def print_logs(self, level: int = 0) -> None:
indent = "\t" * level
print("\n")
print(f"{indent}> F0Dataset ")
print(f"{indent}| > Number of instances : {len(self.samples)}")
class EnergyDataset:
"""Energy Dataset for computing energy from wav files on CPU.
Pre-computes energy values for all the samples at initialization if `cache_path` is set and not already present. It
also computes the mean and std of the energy values if `normalize_energy` is True.
Args:
samples (Union[List[List], List[Dict]]):
List of samples. Each sample is a list or a dict.
ap (AudioProcessor):
AudioProcessor to compute Energy from wav files.
cache_path (str):
Path to cache Energy values. If `cache_path` is already present or None, it skips the pre-computation.
Defaults to None.
precompute_num_workers (int):
Number of workers used for pre-computing the Energy values. Defaults to 0.
normalize_energy (bool):
Whether to normalize energy values by mean and std. Defaults to True.
"""
def __init__(
self,
samples: Union[List[List], List[Dict]],
ap: "AudioProcessor",
verbose=False,
cache_path: str = None,
precompute_num_workers=0,
normalize_energy=True,
):
self.samples = samples
self.ap = ap
self.verbose = verbose
self.cache_path = cache_path
self.normalize_energy = normalize_energy
self.pad_id = 0.0
self.mean = None
self.std = None
if cache_path is not None and not os.path.exists(cache_path):
os.makedirs(cache_path)
self.precompute(precompute_num_workers)
if normalize_energy:
self.load_stats(cache_path)
def __getitem__(self, idx):
item = self.samples[idx]
energy = self.compute_or_load(item["audio_file"], string2filename(item["audio_unique_name"]))
if self.normalize_energy:
assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available"
energy = self.normalize(energy)
return {"audio_unique_name": item["audio_unique_name"], "energy": energy}
def __len__(self):
return len(self.samples)
def precompute(self, num_workers=0):
print("[*] Pre-computing energies...")
with tqdm.tqdm(total=len(self)) as pbar:
batch_size = num_workers if num_workers > 0 else 1
# we do not normalize at preprocessing
normalize_energy = self.normalize_energy
self.normalize_energy = False
dataloader = torch.utils.data.DataLoader(
batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
)
computed_data = []
for batch in dataloader:
energy = batch["energy"]
computed_data.append(e for e in energy)
pbar.update(batch_size)
self.normalize_energy = normalize_energy
if self.normalize_energy:
computed_data = [tensor for batch in computed_data for tensor in batch] # flatten
energy_mean, energy_std = self.compute_energy_stats(computed_data)
energy_stats = {"mean": energy_mean, "std": energy_std}
np.save(os.path.join(self.cache_path, "energy_stats"), energy_stats, allow_pickle=True)
def get_pad_id(self):
return self.pad_id
@staticmethod
def create_energy_file_path(wav_file, cache_path):
file_name = os.path.splitext(os.path.basename(wav_file))[0]
energy_file = os.path.join(cache_path, file_name + "_energy.npy")
return energy_file
@staticmethod
def _compute_and_save_energy(ap, wav_file, energy_file=None):
wav = ap.load_wav(wav_file)
energy = calculate_energy(wav, fft_size=ap.fft_size, hop_length=ap.hop_length, win_length=ap.win_length)
if energy_file:
np.save(energy_file, energy)
return energy
@staticmethod
def compute_energy_stats(energy_vecs):
nonzeros = np.concatenate([v[np.where(v != 0.0)[0]] for v in energy_vecs])
mean, std = np.mean(nonzeros), np.std(nonzeros)
return mean, std
def load_stats(self, cache_path):
stats_path = os.path.join(cache_path, "energy_stats.npy")
stats = np.load(stats_path, allow_pickle=True).item()
self.mean = stats["mean"].astype(np.float32)
self.std = stats["std"].astype(np.float32)
def normalize(self, energy):
zero_idxs = np.where(energy == 0.0)[0]
energy = energy - self.mean
energy = energy / self.std
energy[zero_idxs] = 0.0
return energy
def denormalize(self, energy):
zero_idxs = np.where(energy == 0.0)[0]
energy *= self.std
energy += self.mean
energy[zero_idxs] = 0.0
return energy
def compute_or_load(self, wav_file, audio_unique_name):
"""
compute energy and return a numpy array of energy values
"""
energy_file = self.create_energy_file_path(audio_unique_name, self.cache_path)
if not os.path.exists(energy_file):
energy = self._compute_and_save_energy(self.ap, wav_file, energy_file)
else:
energy = np.load(energy_file)
return energy.astype(np.float32)
def collate_fn(self, batch):
audio_unique_name = [item["audio_unique_name"] for item in batch]
energys = [item["energy"] for item in batch]
energy_lens = [len(item["energy"]) for item in batch]
energy_lens_max = max(energy_lens)
energys_torch = torch.LongTensor(len(energys), energy_lens_max).fill_(self.get_pad_id())
for i, energy_len in enumerate(energy_lens):
energys_torch[i, :energy_len] = torch.LongTensor(energys[i])
return {"audio_unique_name": audio_unique_name, "energy": energys_torch, "energy_lens": energy_lens}
def print_logs(self, level: int = 0) -> None:
indent = "\t" * level
print("\n")
print(f"{indent}> EnergyDataset ")
print(f"{indent}| > Number of instances : {len(self.samples)}")
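A hedged sketch of using `EnergyDataset` on its own to pre-compute and normalize energy values; the `AudioProcessor` arguments, paths, and sample entries are placeholders:

from TTS.tts.datasets.dataset import EnergyDataset  # as defined above
from TTS.utils.audio import AudioProcessor

ap = AudioProcessor(sample_rate=22050, fft_size=1024, hop_length=256, win_length=1024)
samples = [
    {"audio_file": "wavs/LJ001-0001.wav", "audio_unique_name": "ljspeech#wavs/LJ001-0001"},
]
energy_ds = EnergyDataset(samples, ap, cache_path="cache/energy", precompute_num_workers=0)
item = energy_ds[0]
print(item["audio_unique_name"], item["energy"].shape)  # normalized energy contour per frame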

View File

@ -13,8 +13,56 @@ from tqdm import tqdm
########################
def cml_tts(root_path, meta_file, ignored_speakers=None):
"""Normalizes the CML-TTS meta data file to TTS format
https://github.com/freds0/CML-TTS-Dataset/"""
filepath = os.path.join(root_path, meta_file)
# ensure every line has the same number of columns as the first row
with open(filepath, "r", encoding="utf8") as f:
lines = f.readlines()
num_cols = len(lines[0].split("|")) # take the first row as reference
for idx, line in enumerate(lines[1:]):
if len(line.split("|")) != num_cols:
print(f" > Missing column in line {idx + 1} -> {line.strip()}")
# load metadata
metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
assert all(x in metadata.columns for x in ["wav_filename", "transcript"])
client_id = None if "client_id" in metadata.columns else "default"
emotion_name = None if "emotion_name" in metadata.columns else "neutral"
items = []
not_found_counter = 0
for row in metadata.itertuples():
if client_id is None and ignored_speakers is not None and row.client_id in ignored_speakers:
continue
audio_path = os.path.join(root_path, row.wav_filename)
if not os.path.exists(audio_path):
not_found_counter += 1
continue
items.append(
{
"text": row.transcript,
"audio_file": audio_path,
"speaker_name": client_id if client_id is not None else row.client_id,
"emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
"root_path": root_path,
}
)
if not_found_counter > 0:
print(f" | > [!] {not_found_counter} files not found")
return items
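# Hypothetical, self-contained usage sketch for cml_tts() above (file names,
# speaker id and transcript are made up; in a real setup the formatter is
# imported from TTS.tts.datasets.formatters):
import os
import tempfile

root = tempfile.mkdtemp()
os.makedirs(os.path.join(root, "audio"), exist_ok=True)
open(os.path.join(root, "audio", "0001.wav"), "wb").close()  # placeholder wav
with open(os.path.join(root, "train.csv"), "w", encoding="utf8") as f:
    f.write("wav_filename|transcript|client_id\n")
    f.write("audio/0001.wav|ola mundo|spk1\n")

items = cml_tts(root, "train.csv")
print(items[0]["speaker_name"], items[0]["text"])  # -> spk1 ola mundo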
def coqui(root_path, meta_file, ignored_speakers=None):
"""Interal dataset formatter."""
filepath = os.path.join(root_path, meta_file)
# ensure there are 4 columns for every line
with open(filepath, "r", encoding="utf8") as f:
lines = f.readlines()
num_cols = len(lines[0].split("|")) # take the first row as reference
for idx, line in enumerate(lines[1:]):
if len(line.split("|")) != num_cols:
print(f" > Missing column in line {idx + 1} -> {line.strip()}")
# load metadata
metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
assert all(x in metadata.columns for x in ["audio_file", "text"])
speaker_name = None if "speaker_name" in metadata.columns else "coqui"
@ -97,9 +145,9 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None):
meta_files (str): list of meta files to be used in the training. If None, finds all the csv files
recursively. Defaults to None
"""
speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
speaker_regex = re.compile(f"by_book{os.sep}(male|female){os.sep}(?P<speaker_name>[^{os.sep}]+){os.sep}")
if not meta_files:
csv_files = glob(root_path + "/**/metadata.csv", recursive=True)
csv_files = glob(root_path + f"{os.sep}**{os.sep}metadata.csv", recursive=True)
else:
csv_files = meta_files
@ -232,7 +280,7 @@ def css10(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
cols = line.split("|")
wav_file = os.path.join(root_path, cols[0])
text = cols[1]
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
@ -246,7 +294,7 @@ def nancy(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
utt_id = line.split()[1]
text = line[line.find('"') + 1 : line.rfind('"') - 1]
wav_file = os.path.join(root_path, "wavn", utt_id + ".wav")
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
@ -578,3 +626,30 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
text = cols[2].replace(" ", "")
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Korean single-speaker dataset from https://www.kaggle.com/datasets/bryanpark/korean-single-speaker-speech-dataset"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "kss"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, cols[0])
text = cols[2]  # cols[1] has digits (e.g. "6월"), cols[2] the spelled-out form (e.g. "유월", "June")
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def bel_tts_formatter(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "bel_tts"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, cols[0])
text = cols[1]
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items

View File

View File

View File

@ -0,0 +1,35 @@
# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
import os.path
import shutil
import urllib.request
import huggingface_hub
class HubertManager:
@staticmethod
def make_sure_hubert_installed(
download_url: str = "https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", model_path: str = ""
):
if not os.path.isfile(model_path):
print("Downloading HuBERT base model")
urllib.request.urlretrieve(download_url, model_path)
print("Downloaded HuBERT")
return model_path
return None
@staticmethod
def make_sure_tokenizer_installed(
model: str = "quantifier_hubert_base_ls960_14.pth",
repo: str = "GitMylo/bark-voice-cloning",
model_path: str = "",
):
model_dir = os.path.dirname(model_path)
if not os.path.isfile(model_path):
print("Downloading HuBERT custom tokenizer")
huggingface_hub.hf_hub_download(repo, model, local_dir=model_dir, local_dir_use_symlinks=False)
shutil.move(os.path.join(model_dir, model), model_path)
print("Downloaded tokenizer")
return model_path
return None
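# Hypothetical usage of HubertManager above; destination paths are placeholders
# (not the paths configured in the Bark model config), and both calls download
# files on first run:
from pathlib import Path

cache = Path("bark_assets")
cache.mkdir(exist_ok=True)
HubertManager.make_sure_hubert_installed(model_path=str(cache / "hubert_base_ls960.pt"))
HubertManager.make_sure_tokenizer_installed(model_path=str(cache / "tokenizer.pth"))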

View File

@ -0,0 +1,82 @@
"""
Modified HuBERT model without kmeans.
Original author: https://github.com/lucidrains/
Modified by: https://www.github.com/gitmylo/
License: MIT
"""
# Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py
import logging
from pathlib import Path
import torch
from einops import pack, unpack
from torch import nn
from torchaudio.functional import resample
from transformers import HubertModel
def round_down_nearest_multiple(num, divisor):
return num // divisor * divisor
def curtail_to_multiple(t, mult, from_left=False):
data_len = t.shape[-1]
rounded_seq_len = round_down_nearest_multiple(data_len, mult)
seq_slice = slice(None, rounded_seq_len) if not from_left else slice(-rounded_seq_len, None)
return t[..., seq_slice]
def exists(val):
return val is not None
def default(val, d):
return val if exists(val) else d
class CustomHubert(nn.Module):
"""
checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
or you can train your own
"""
def __init__(self, checkpoint_path, target_sample_hz=16000, seq_len_multiple_of=None, output_layer=9, device=None):
super().__init__()
self.target_sample_hz = target_sample_hz
self.seq_len_multiple_of = seq_len_multiple_of
self.output_layer = output_layer
if device is not None:
self.to(device)
self.model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
if device is not None:
self.model.to(device)
self.model.eval()
@property
def groups(self):
return 1
@torch.no_grad()
def forward(self, wav_input, flatten=True, input_sample_hz=None):
device = wav_input.device
if exists(input_sample_hz):
wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz)
if exists(self.seq_len_multiple_of):
wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
outputs = self.model.forward(
wav_input,
output_hidden_states=True,
)
embed = outputs["hidden_states"][self.output_layer]
embed, packed_shape = pack([embed], "* d")
codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)
if flatten:
return codebook_indices
(codebook_indices,) = unpack(codebook_indices, packed_shape, "*")
return codebook_indices
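# Illustrative usage of the CustomHubert wrapper above (the wav path is
# hypothetical; checkpoint_path is unused by this modified wrapper, which pulls
# facebook/hubert-base-ls960 from the Hugging Face Hub on first use):
import torchaudio

wav, sr = torchaudio.load("speaker.wav")        # shape [channels, samples]
hubert = CustomHubert(checkpoint_path=None)
vectors = hubert(wav[0:1], input_sample_hz=sr)  # resampled to 16 kHz internally
print(vectors.shape)                            # [n_frames, 768] for the base model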

View File

@ -0,0 +1,195 @@
"""
Custom tokenizer model.
Author: https://www.github.com/gitmylo/
License: MIT
"""
import json
import os.path
from zipfile import ZipFile
import numpy
import torch
from torch import nn, optim
class HubertTokenizer(nn.Module):
def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
super().__init__()
next_size = input_size
if version == 0:
self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
next_size = hidden_size
if version == 1:
self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
self.intermediate = nn.Linear(hidden_size, 4096)
next_size = 4096
self.fc = nn.Linear(next_size, output_size)
self.softmax = nn.LogSoftmax(dim=1)
self.optimizer: optim.Optimizer = None
self.lossfunc = nn.CrossEntropyLoss()
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
self.version = version
def forward(self, x):
x, _ = self.lstm(x)
if self.version == 1:
x = self.intermediate(x)
x = self.fc(x)
x = self.softmax(x)
return x
@torch.no_grad()
def get_token(self, x):
"""
Used to get the output tokens for a batch of input feature vectors.
:param x: An array with shape (N, input_size) where N is a whole number greater or equal to 1, and input_size is the input size used when creating the model.
:return: An array with shape (N,) where N is the same as N from the input. Every number in the array is a whole number in range 0...output_size - 1 where output_size is the output size used when creating the model.
"""
return torch.argmax(self(x), dim=1)
def prepare_training(self):
self.optimizer = optim.Adam(self.parameters(), 0.001)
def train_step(self, x_train, y_train, log_loss=False):
# y_train = y_train[:-1]
# y_train = y_train[1:]
optimizer = self.optimizer
lossfunc = self.lossfunc
# Zero the gradients
self.zero_grad()
# Forward pass
y_pred = self(x_train)
y_train_len = len(y_train)
y_pred_len = y_pred.shape[0]
if y_train_len > y_pred_len:
diff = y_train_len - y_pred_len
y_train = y_train[diff:]
elif y_train_len < y_pred_len:
diff = y_pred_len - y_train_len
y_pred = y_pred[:-diff, :]
y_train_hot = torch.zeros(len(y_train), self.output_size)
y_train_hot[range(len(y_train)), y_train] = 1
y_train_hot = y_train_hot.to("cuda")
# Calculate the loss
loss = lossfunc(y_pred, y_train_hot)
# Print loss
if log_loss:
print("Loss", loss.item())
# Backward pass
loss.backward()
# Update the weights
optimizer.step()
def save(self, path):
info_path = ".".join(os.path.basename(path).split(".")[:-1]) + "/.info"
torch.save(self.state_dict(), path)
data_from_model = Data(self.input_size, self.hidden_size, self.output_size, self.version)
with ZipFile(path, "a") as model_zip:
model_zip.writestr(info_path, data_from_model.save())
model_zip.close()
@staticmethod
def load_from_checkpoint(path, map_location=None):
old = True
with ZipFile(path) as model_zip:
filesMatch = [file for file in model_zip.namelist() if file.endswith("/.info")]
file = filesMatch[0] if filesMatch else None
if file:
old = False
data_from_model = Data.load(model_zip.read(file).decode("utf-8"))
model_zip.close()
if old:
model = HubertTokenizer()
else:
model = HubertTokenizer(
data_from_model.hidden_size,
data_from_model.input_size,
data_from_model.output_size,
data_from_model.version,
)
model.load_state_dict(torch.load(path, map_location=map_location))
if map_location:
model = model.to(map_location)
return model
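# Hypothetical inference sketch for HubertTokenizer above: map HuBERT hidden
# states of shape (N, 768) to discrete semantic tokens of shape (N,). The
# tokenizer here is untrained and only illustrates shapes (unbatched LSTM input
# needs a reasonably recent torch):
import torch

tok = HubertTokenizer(version=1)
feats = torch.randn(100, 768)        # e.g. the output of CustomHubert above
tokens = tok.get_token(feats)        # LongTensor of shape (100,)
print(tokens.shape, int(tokens.max()) < 10000)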
class Data:
input_size: int
hidden_size: int
output_size: int
version: int
def __init__(self, input_size=768, hidden_size=1024, output_size=10000, version=0):
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
self.version = version
@staticmethod
def load(string):
data = json.loads(string)
return Data(data["input_size"], data["hidden_size"], data["output_size"], data["version"])
def save(self):
data = {
"input_size": self.input_size,
"hidden_size": self.hidden_size,
"output_size": self.output_size,
"version": self.version,
}
return json.dumps(data)
def auto_train(data_path, save_path="model.pth", load_model: str = None, save_epochs=1):
data_x, data_y = [], []
if load_model and os.path.isfile(load_model):
print("Loading model from", load_model)
model_training = HubertTokenizer.load_from_checkpoint(load_model, "cuda")
else:
print("Creating new model.")
model_training = HubertTokenizer(version=1).to("cuda")  # version=1 adds an intermediate linear layer on top of the LSTM
save_path = os.path.join(data_path, save_path)
base_save_path = ".".join(save_path.split(".")[:-1])
sem_string = "_semantic.npy"
feat_string = "_semantic_features.npy"
ready = os.path.join(data_path, "ready")
for input_file in os.listdir(ready):
full_path = os.path.join(ready, input_file)
if input_file.endswith(sem_string):
data_y.append(numpy.load(full_path))
elif input_file.endswith(feat_string):
data_x.append(numpy.load(full_path))
model_training.prepare_training()
epoch = 1
while 1:
for _ in range(save_epochs):
j = 0
for x, y in zip(data_x, data_y):
model_training.train_step(
torch.tensor(x).to("cuda"), torch.tensor(y).to("cuda"), j % 50 == 0
) # Print loss every 50 steps
j += 1
save_p = save_path
save_p_2 = f"{base_save_path}_epoch_{epoch}.pth"
model_training.save(save_p)
model_training.save(save_p_2)
print(f"Epoch {epoch} completed")
epoch += 1
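# Illustrative directory layout assumed by auto_train() above (file names are
# hypothetical; only the suffixes matter):
#
#   <data_path>/ready/utt0001_semantic_features.npy  -> HuBERT features, shape (T, 768)
#   <data_path>/ready/utt0001_semantic.npy           -> target semantic tokens, shape (T,)
#   ...
#
# auto_train("<data_path>") then trains indefinitely on CUDA, writing model.pth
# and model_epoch_<n>.pth checkpoints inside <data_path>.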

View File

@ -0,0 +1,606 @@
import logging
import os
import re
from glob import glob
from typing import Dict, List
import librosa
import numpy as np
import torch
import torchaudio
import tqdm
from encodec.utils import convert_audio
from scipy.special import softmax
from torch.nn import functional as F
from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager
from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert
from TTS.tts.layers.bark.hubert.tokenizer import HubertTokenizer
from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode
logger = logging.getLogger(__name__)
def _tokenize(tokenizer, text):
return tokenizer.encode(text, add_special_tokens=False)
def _detokenize(tokenizer, enc_text):
return tokenizer.decode(enc_text)
def _normalize_whitespace(text):
return re.sub(r"\s+", " ", text).strip()
def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value
dirs = extra_voice_dirs
voices: Dict[str, List[str]] = {}
for d in dirs:
subs = os.listdir(d)
for sub in subs:
subj = os.path.join(d, sub)
if os.path.isdir(subj):
voices[sub] = list(glob(f"{subj}/*.npz"))
# fetch audio files if no npz files are found
if len(voices[sub]) == 0:
voices[sub] = list(glob(f"{subj}/*.wav")) + list(glob(f"{subj}/*.mp3"))
return voices
def load_npz(npz_file):
x_history = np.load(npz_file)
semantic = x_history["semantic_prompt"]
coarse = x_history["coarse_prompt"]
fine = x_history["fine_prompt"]
return semantic, coarse, fine
def load_voice(model, voice: str, extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value
if voice == "random":
return None, None, None
voices = get_voices(extra_voice_dirs)
try:
paths = voices[voice]
except KeyError as e:
raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}") from e
# bark only uses a single sample for cloning
if len(paths) > 1:
raise ValueError(f"Voice {voice} has multiple paths: {paths}")
if len(paths) == 1 and paths[0].endswith(".npz"):
return load_npz(paths[0])
audio_path = paths[0]
# replace the file extension with .npz
output_path = os.path.splitext(audio_path)[0] + ".npz"
generate_voice(audio=audio_path, model=model, output_path=output_path)
return load_voice(model, voice, extra_voice_dirs)
def zero_crossing_rate(audio, frame_length=1024, hop_length=512):
zero_crossings = np.sum(np.abs(np.diff(np.sign(audio))) / 2)
total_frames = 1 + int((len(audio) - frame_length) / hop_length)
return zero_crossings / total_frames
def compute_spectral_contrast(audio_data, sample_rate, n_bands=6, fmin=200.0):
spectral_contrast = librosa.feature.spectral_contrast(y=audio_data, sr=sample_rate, n_bands=n_bands, fmin=fmin)
return np.mean(spectral_contrast)
def compute_average_bass_energy(audio_data, sample_rate, max_bass_freq=250):
stft = librosa.stft(audio_data)
power_spectrogram = np.abs(stft) ** 2
frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=stft.shape[0])
bass_mask = frequencies <= max_bass_freq
bass_energy = power_spectrogram[np.ix_(bass_mask, np.arange(power_spectrogram.shape[1]))].mean()
return bass_energy
def generate_voice(
audio,
model,
output_path,
):
"""Generate a new voice from a given audio and text prompt.
Args:
audio (np.ndarray): The audio to use as a base for the new voice.
text (str): Transcription of the audio you are clonning.
model (BarkModel): The BarkModel to use for generating the new voice.
output_path (str): The path to save the generated voice to.
"""
if isinstance(audio, str):
audio, sr = torchaudio.load(audio)
audio = convert_audio(audio, sr, model.config.sample_rate, model.encodec.channels)
audio = audio.unsqueeze(0).to(model.device)
with torch.no_grad():
encoded_frames = model.encodec.encode(audio)
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]
# move codes to cpu
codes = codes.cpu().numpy()
# generate semantic tokens
# Load the HuBERT model
hubert_manager = HubertManager()
# hubert_manager.make_sure_hubert_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert"])
hubert_manager.make_sure_tokenizer_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"])
hubert_model = CustomHubert(checkpoint_path=model.config.LOCAL_MODEL_PATHS["hubert"]).to(model.device)
# Load the CustomTokenizer model
tokenizer = HubertTokenizer.load_from_checkpoint(
model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"], map_location=model.device
)
# semantic_tokens = model.text_to_semantic(
# text, max_gen_duration_s=seconds, top_k=50, top_p=0.95, temp=0.7
# ) # not 100%
semantic_vectors = hubert_model.forward(audio[0], input_sample_hz=model.config.sample_rate)
semantic_tokens = tokenizer.get_token(semantic_vectors)
semantic_tokens = semantic_tokens.cpu().numpy()
np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)
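# Hypothetical usage of generate_voice() above, assuming `model` is a loaded
# Bark instance exposing .config, .encodec and .device as the function expects:
#
#   generate_voice(audio="reference.wav", model=model, output_path="speaker.npz")
#
# The resulting .npz stores semantic_prompt / coarse_prompt / fine_prompt and
# can then be consumed by load_voice() / load_npz() above.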
def generate_text_semantic(
text,
model,
history_prompt=None,
temp=0.7,
top_k=None,
top_p=None,
silent=False,
min_eos_p=0.2,
max_gen_duration_s=None,
allow_early_stop=True,
base=None,
use_kv_caching=True,
**kwargs, # pylint: disable=unused-argument
):
"""Generate semantic tokens from text.
Args:
text (str): The text to generate semantic tokens from.
model (BarkModel): The BarkModel to use for generating the semantic tokens.
history_prompt (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a prompt for the generation.
temp (float): The temperature to use for the generation.
top_k (int): The number of top tokens to consider for the generation.
top_p (float): The cumulative probability to consider for the generation.
silent (bool): Whether to silence the tqdm progress bar.
min_eos_p (float): The minimum probability to consider for the end of sentence token.
max_gen_duration_s (float): The maximum duration in seconds to generate for.
allow_early_stop (bool): Whether to allow the generation to stop early.
base (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a base for the generation.
use_kv_caching (bool): Whether to use key-value caching for the generation.
**kwargs: Additional keyword arguments. They are ignored.
Returns:
np.ndarray: The generated semantic tokens.
"""
assert isinstance(text, str)
text = _normalize_whitespace(text)
assert len(text.strip()) > 0
if all(v is not None for v in history_prompt) or base is not None:
if history_prompt is not None:
semantic_history = history_prompt[0]
if base is not None:
semantic_history = base[0]
assert (
isinstance(semantic_history, np.ndarray)
and len(semantic_history.shape) == 1
and len(semantic_history) > 0
and semantic_history.min() >= 0
and semantic_history.max() <= model.config.SEMANTIC_VOCAB_SIZE - 1
)
else:
semantic_history = None
encoded_text = np.array(_tokenize(model.tokenizer, text)) + model.config.TEXT_ENCODING_OFFSET
if len(encoded_text) > 256:
p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1)
logger.warning(f"warning, text too long, lopping of last {p}%")
encoded_text = encoded_text[:256]
encoded_text = np.pad(
encoded_text,
(0, 256 - len(encoded_text)),
constant_values=model.config.TEXT_PAD_TOKEN,
mode="constant",
)
if semantic_history is not None:
semantic_history = semantic_history.astype(np.int64)
# lop off if history is too long, pad if needed
semantic_history = semantic_history[-256:]
semantic_history = np.pad(
semantic_history,
(0, 256 - len(semantic_history)),
constant_values=model.config.SEMANTIC_PAD_TOKEN,
mode="constant",
)
else:
semantic_history = np.array([model.config.SEMANTIC_PAD_TOKEN] * 256)
x = torch.from_numpy(
np.hstack([encoded_text, semantic_history, np.array([model.config.SEMANTIC_INFER_TOKEN])]).astype(np.int64)
)[None]
assert x.shape[1] == 256 + 256 + 1
with inference_mode():
x = x.to(model.device)
n_tot_steps = 768
# custom tqdm updates since we don't know when eos will occur
pbar = tqdm.tqdm(disable=silent, total=100)
pbar_state = 0
tot_generated_duration_s = 0
kv_cache = None
for n in range(n_tot_steps):
if use_kv_caching and kv_cache is not None:
x_input = x[:, [-1]]
else:
x_input = x
logits, kv_cache = model.semantic_model(
x_input, merge_context=True, use_cache=use_kv_caching, past_kv=kv_cache
)
relevant_logits = logits[0, 0, : model.config.SEMANTIC_VOCAB_SIZE]
if allow_early_stop:
relevant_logits = torch.hstack(
(relevant_logits, logits[0, 0, [model.config.SEMANTIC_PAD_TOKEN]])
) # eos
if top_p is not None:
# faster to convert to numpy
logits_device = relevant_logits.device
logits_dtype = relevant_logits.type()
relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
sorted_indices = np.argsort(relevant_logits)[::-1]
sorted_logits = relevant_logits[sorted_indices]
cumulative_probs = np.cumsum(softmax(sorted_logits))
sorted_indices_to_remove = cumulative_probs > top_p
sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
sorted_indices_to_remove[0] = False
relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
relevant_logits = torch.from_numpy(relevant_logits)
relevant_logits = relevant_logits.to(logits_device).type(logits_dtype)
if top_k is not None:
v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
relevant_logits[relevant_logits < v[-1]] = -float("Inf")
probs = torch.softmax(relevant_logits / temp, dim=-1)
item_next = torch.multinomial(probs, num_samples=1)
if allow_early_stop and (
item_next == model.config.SEMANTIC_VOCAB_SIZE or (min_eos_p is not None and probs[-1] >= min_eos_p)
):
# eos found, so break
pbar.update(100 - pbar_state)
break
x = torch.cat((x, item_next[None]), dim=1)
tot_generated_duration_s += 1 / model.config.SEMANTIC_RATE_HZ
if max_gen_duration_s is not None and tot_generated_duration_s > max_gen_duration_s:
pbar.update(100 - pbar_state)
break
if n == n_tot_steps - 1:
pbar.update(100 - pbar_state)
break
del logits, relevant_logits, probs, item_next
req_pbar_state = np.min([100, int(round(100 * n / n_tot_steps))])
if req_pbar_state > pbar_state:
pbar.update(req_pbar_state - pbar_state)
pbar_state = req_pbar_state
pbar.close()
out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
assert all(out >= 0) and all(out < model.config.SEMANTIC_VOCAB_SIZE)
clear_cuda_cache()
return out
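# Standalone sketch of the nucleus (top-p) filtering used inside
# generate_text_semantic() above and generate_coarse() below: keep the smallest
# set of tokens whose cumulative probability exceeds top_p and mask the rest.
import numpy as np
from scipy.special import softmax

def top_p_filter(logits, top_p=0.95):
    sorted_indices = np.argsort(logits)[::-1]
    cumulative_probs = np.cumsum(softmax(logits[sorted_indices]))
    to_remove = cumulative_probs > top_p
    to_remove[1:] = to_remove[:-1].copy()  # shift right so the top token is always kept
    to_remove[0] = False
    out = logits.copy()
    out[sorted_indices[to_remove]] = -np.inf
    return out

print(top_p_filter(np.array([2.0, 1.0, 0.5, -1.0]), top_p=0.9))  # last logit -> -inf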
def _flatten_codebooks(arr, offset_size):
assert len(arr.shape) == 2
arr = arr.copy()
if offset_size is not None:
for n in range(1, arr.shape[0]):
arr[n, :] += offset_size * n
flat_arr = arr.ravel("F")
return flat_arr
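# Worked example for _flatten_codebooks() above: each codebook row gets its own
# offset and the [n_q, T] array is flattened column-major, interleaving the
# codebooks frame by frame.
import numpy as np

arr = np.array([[1, 2, 3],    # codebook 0
                [4, 5, 6]])   # codebook 1
print(_flatten_codebooks(arr, offset_size=10))  # -> [ 1 14  2 15  3 16]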
def generate_coarse(
x_semantic,
model,
history_prompt=None,
temp=0.7,
top_k=None,
top_p=None,
silent=False,
max_coarse_history=630, # min 60 (faster), max 630 (more context)
sliding_window_len=60,
base=None,
use_kv_caching=True,
):
"""Generate coarse audio codes from semantic tokens.
Args:
x_semantic (np.ndarray): The semantic tokens to generate coarse audio codes from.
model (BarkModel): The BarkModel to use for generating the coarse audio codes.
history_prompt (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a prompt for the generation.
temp (float): The temperature to use for the generation.
top_k (int): The number of top tokens to consider for the generation.
top_p (float): The cumulative probability to consider for the generation.
silent (bool): Whether to silence the tqdm progress bar.
max_coarse_history (int): The maximum number of coarse audio codes to use as history.
sliding_window_len (int): The length of the sliding window to use for the generation.
base (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a base for the generation.
use_kv_caching (bool): Whether to use key-value caching for the generation.
Returns:
np.ndarray: The generated coarse audio codes.
"""
assert (
isinstance(x_semantic, np.ndarray)
and len(x_semantic.shape) == 1
and len(x_semantic) > 0
and x_semantic.min() >= 0
and x_semantic.max() <= model.config.SEMANTIC_VOCAB_SIZE - 1
)
assert 60 <= max_coarse_history <= 630
assert max_coarse_history + sliding_window_len <= 1024 - 256
semantic_to_coarse_ratio = (
model.config.COARSE_RATE_HZ / model.config.SEMANTIC_RATE_HZ * model.config.N_COARSE_CODEBOOKS
)
max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
if all(v is not None for v in history_prompt) or base is not None:
if history_prompt is not None:
x_history = history_prompt
x_semantic_history = x_history[0]
x_coarse_history = x_history[1]
if base is not None:
x_semantic_history = base[0]
x_coarse_history = base[1]
assert (
isinstance(x_semantic_history, np.ndarray)
and len(x_semantic_history.shape) == 1
and len(x_semantic_history) > 0
and x_semantic_history.min() >= 0
and x_semantic_history.max() <= model.config.SEMANTIC_VOCAB_SIZE - 1
and isinstance(x_coarse_history, np.ndarray)
and len(x_coarse_history.shape) == 2
and x_coarse_history.shape[0] == model.config.N_COARSE_CODEBOOKS
and x_coarse_history.shape[-1] >= 0
and x_coarse_history.min() >= 0
and x_coarse_history.max() <= model.config.CODEBOOK_SIZE - 1
and (
round(x_coarse_history.shape[-1] / len(x_semantic_history), 1)
== round(semantic_to_coarse_ratio / model.config.N_COARSE_CODEBOOKS, 1)
)
)
x_coarse_history = (
_flatten_codebooks(x_coarse_history, model.config.CODEBOOK_SIZE) + model.config.SEMANTIC_VOCAB_SIZE
)
# trim histories correctly
n_semantic_hist_provided = np.min(
[
max_semantic_history,
len(x_semantic_history) - len(x_semantic_history) % 2,
int(np.floor(len(x_coarse_history) / semantic_to_coarse_ratio)),
]
)
n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio))
x_semantic_history = x_semantic_history[-n_semantic_hist_provided:].astype(np.int32)
x_coarse_history = x_coarse_history[-n_coarse_hist_provided:].astype(np.int32)
# TODO: bit of a hack for time alignment (sounds better)
x_coarse_history = x_coarse_history[:-2]
else:
x_semantic_history = np.array([], dtype=np.int32)
x_coarse_history = np.array([], dtype=np.int32)
# start loop
n_steps = int(
round(
np.floor(len(x_semantic) * semantic_to_coarse_ratio / model.config.N_COARSE_CODEBOOKS)
* model.config.N_COARSE_CODEBOOKS
)
)
assert n_steps > 0 and n_steps % model.config.N_COARSE_CODEBOOKS == 0
x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
x_coarse = x_coarse_history.astype(np.int32)
base_semantic_idx = len(x_semantic_history)
with inference_mode():
x_semantic_in = torch.from_numpy(x_semantic)[None].to(model.device)
x_coarse_in = torch.from_numpy(x_coarse)[None].to(model.device)
n_window_steps = int(np.ceil(n_steps / sliding_window_len))
n_step = 0
for _ in tqdm.tqdm(range(n_window_steps), total=n_window_steps, disable=silent):
semantic_idx = base_semantic_idx + int(round(n_step / semantic_to_coarse_ratio))
# pad from right side
x_in = x_semantic_in[:, np.max([0, semantic_idx - max_semantic_history]) :]
x_in = x_in[:, :256]
x_in = F.pad(
x_in,
(0, 256 - x_in.shape[-1]),
"constant",
model.config.COARSE_SEMANTIC_PAD_TOKEN,
)
x_in = torch.hstack(
[
x_in,
torch.tensor([model.config.COARSE_INFER_TOKEN])[None].to(model.device),
x_coarse_in[:, -max_coarse_history:],
]
)
kv_cache = None
for _ in range(sliding_window_len):
if n_step >= n_steps:
continue
is_major_step = n_step % model.config.N_COARSE_CODEBOOKS == 0
if use_kv_caching and kv_cache is not None:
x_input = x_in[:, [-1]]
else:
x_input = x_in
logits, kv_cache = model.coarse_model(x_input, use_cache=use_kv_caching, past_kv=kv_cache)
logit_start_idx = (
model.config.SEMANTIC_VOCAB_SIZE + (1 - int(is_major_step)) * model.config.CODEBOOK_SIZE
)
logit_end_idx = model.config.SEMANTIC_VOCAB_SIZE + (2 - int(is_major_step)) * model.config.CODEBOOK_SIZE
relevant_logits = logits[0, 0, logit_start_idx:logit_end_idx]
if top_p is not None:
# faster to convert to numpy
logits_device = relevant_logits.device
logits_dtype = relevant_logits.type()
relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
sorted_indices = np.argsort(relevant_logits)[::-1]
sorted_logits = relevant_logits[sorted_indices]
cumulative_probs = np.cumsum(softmax(sorted_logits))
sorted_indices_to_remove = cumulative_probs > top_p
sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
sorted_indices_to_remove[0] = False
relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
relevant_logits = torch.from_numpy(relevant_logits)
relevant_logits = relevant_logits.to(logits_device).type(logits_dtype)
if top_k is not None:
v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
relevant_logits[relevant_logits < v[-1]] = -float("Inf")
probs = torch.nn.functional.softmax(relevant_logits / temp, dim=-1)
item_next = torch.multinomial(probs, num_samples=1)
item_next += logit_start_idx
x_coarse_in = torch.cat((x_coarse_in, item_next[None]), dim=1)
x_in = torch.cat((x_in, item_next[None]), dim=1)
del logits, relevant_logits, probs, item_next
n_step += 1
del x_in
del x_semantic_in
gen_coarse_arr = x_coarse_in.detach().cpu().numpy().squeeze()[len(x_coarse_history) :]
del x_coarse_in
assert len(gen_coarse_arr) == n_steps
gen_coarse_audio_arr = (
gen_coarse_arr.reshape(-1, model.config.N_COARSE_CODEBOOKS).T - model.config.SEMANTIC_VOCAB_SIZE
)
for n in range(1, model.config.N_COARSE_CODEBOOKS):
gen_coarse_audio_arr[n, :] -= n * model.config.CODEBOOK_SIZE
clear_cuda_cache()
return gen_coarse_audio_arr
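# Rough numbers, assuming Bark's usual configuration (SEMANTIC_RATE_HZ ~= 49.9,
# COARSE_RATE_HZ = 75, N_COARSE_CODEBOOKS = 2; these constants are not shown in
# this diff):
#
#   semantic_to_coarse_ratio ~= 75 / 49.9 * 2 ~= 3.0
#
# so generate_coarse() emits roughly three coarse tokens per semantic token, in
# windows of `sliding_window_len` steps while keeping at most `max_coarse_history`
# coarse tokens of history inside the 1024-token context.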
def generate_fine(
x_coarse_gen,
model,
history_prompt=None,
temp=0.5,
silent=True,
base=None,
):
"""Generate full audio codes from coarse audio codes.
Args:
x_coarse_gen (np.ndarray): The coarse audio codes to generate full audio codes from.
model (BarkModel): The BarkModel to use for generating the full audio codes.
history_prompt (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a prompt for the generation.
temp (float): The temperature to use for the generation.
silent (bool): Whether to silence the tqdm progress bar.
base (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a base for the generation.
Returns:
np.ndarray: The generated full audio codes.
"""
assert (
isinstance(x_coarse_gen, np.ndarray)
and len(x_coarse_gen.shape) == 2
and 1 <= x_coarse_gen.shape[0] <= model.config.N_FINE_CODEBOOKS - 1
and x_coarse_gen.shape[1] > 0
and x_coarse_gen.min() >= 0
and x_coarse_gen.max() <= model.config.CODEBOOK_SIZE - 1
)
if all(v is not None for v in history_prompt) or base is not None:
if history_prompt is not None:
x_fine_history = history_prompt[2]
if base is not None:
x_fine_history = base[2]
assert (
isinstance(x_fine_history, np.ndarray)
and len(x_fine_history.shape) == 2
and x_fine_history.shape[0] == model.config.N_FINE_CODEBOOKS
and x_fine_history.shape[1] >= 0
and x_fine_history.min() >= 0
and x_fine_history.max() <= model.config.CODEBOOK_SIZE - 1
)
else:
x_fine_history = None
n_coarse = x_coarse_gen.shape[0]
# make input arr
in_arr = np.vstack(
[
x_coarse_gen,
np.zeros((model.config.N_FINE_CODEBOOKS - n_coarse, x_coarse_gen.shape[1]))
+ model.config.CODEBOOK_SIZE, # padding
]
).astype(np.int32)
# prepend history if available (max 512)
if x_fine_history is not None:
x_fine_history = x_fine_history.astype(np.int32)
in_arr = np.hstack(
[
x_fine_history[:, -512:].astype(np.int32),
in_arr,
]
)
n_history = x_fine_history[:, -512:].shape[1]
else:
n_history = 0
n_remove_from_end = 0
# need to pad if too short (since non-causal model)
if in_arr.shape[1] < 1024:
n_remove_from_end = 1024 - in_arr.shape[1]
in_arr = np.hstack(
[
in_arr,
np.zeros((model.config.N_FINE_CODEBOOKS, n_remove_from_end), dtype=np.int32)
+ model.config.CODEBOOK_SIZE,
]
)
# we can be lazy about fractional loop and just keep overwriting codebooks
n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1
with inference_mode():
in_arr = torch.tensor(in_arr.T).to(model.device)
for n in tqdm.tqdm(range(n_loops), disable=silent):
start_idx = np.min([n * 512, in_arr.shape[0] - 1024])
start_fill_idx = np.min([n_history + n * 512, in_arr.shape[0] - 512])
rel_start_fill_idx = start_fill_idx - start_idx
in_buffer = in_arr[start_idx : start_idx + 1024, :][None]
for nn in range(n_coarse, model.config.N_FINE_CODEBOOKS):
logits = model.fine_model(nn, in_buffer)
if temp is None:
relevant_logits = logits[0, rel_start_fill_idx:, : model.config.CODEBOOK_SIZE]
codebook_preds = torch.argmax(relevant_logits, -1)
else:
relevant_logits = logits[0, :, : model.config.CODEBOOK_SIZE] / temp
probs = F.softmax(relevant_logits, dim=-1)
codebook_preds = torch.hstack(
[torch.multinomial(probs[n], num_samples=1) for n in range(rel_start_fill_idx, 1024)]
)
in_buffer[0, rel_start_fill_idx:, nn] = codebook_preds
del logits, codebook_preds
# transfer over info into model_in and convert to numpy
for nn in range(n_coarse, model.config.N_FINE_CODEBOOKS):
in_arr[start_fill_idx : start_fill_idx + (1024 - rel_start_fill_idx), nn] = in_buffer[
0, rel_start_fill_idx:, nn
]
del in_buffer
gen_fine_arr = in_arr.detach().cpu().numpy().squeeze().T
del in_arr
gen_fine_arr = gen_fine_arr[:, n_history:]
if n_remove_from_end > 0:
gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end]
assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1]
clear_cuda_cache()
return gen_fine_arr
def codec_decode(fine_tokens, model):
"""Turn quantized audio codes into audio array using encodec."""
arr = torch.from_numpy(fine_tokens)[None]
arr = arr.to(model.device)
arr = arr.transpose(0, 1)
emb = model.encodec.quantizer.decode(arr)
out = model.encodec.decoder(emb)
audio_arr = out.detach().cpu().numpy().squeeze()
return audio_arr
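# End-to-end sketch (with a hypothetical loaded `model`, as assumed by the
# functions above) of how these pieces compose for voice-prompted synthesis:
#
#   history  = load_voice(model, "my_speaker", extra_voice_dirs=["voices/"])  # (semantic, coarse, fine)
#   semantic = generate_text_semantic("Hello world.", model, history_prompt=history)
#   coarse   = generate_coarse(semantic, model, history_prompt=history)
#   fine     = generate_fine(coarse, model, history_prompt=history)
#   audio    = codec_decode(fine, model)  # numpy waveform at model.config.sample_rate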

View File

@ -0,0 +1,160 @@
import contextlib
import functools
import hashlib
import logging
import os
import requests
import torch
import tqdm
from TTS.tts.layers.bark.model import GPT, GPTConfig
from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig
if (
torch.cuda.is_available()
and hasattr(torch.cuda, "amp")
and hasattr(torch.cuda.amp, "autocast")
and torch.cuda.is_bf16_supported()
):
autocast = functools.partial(torch.cuda.amp.autocast, dtype=torch.bfloat16)
else:
@contextlib.contextmanager
def autocast():
yield
# hold models in global scope to lazy load
logger = logging.getLogger(__name__)
if not hasattr(torch.nn.functional, "scaled_dot_product_attention"):
logger.warning(
"torch version does not support flash attention. You will get significantly faster"
+ " inference speed by upgrade torch to newest version / nightly."
)
def _md5(fname):
hash_md5 = hashlib.md5()
with open(fname, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def _download(from_s3_path, to_local_path, CACHE_DIR):
os.makedirs(CACHE_DIR, exist_ok=True)
response = requests.get(from_s3_path, stream=True)
total_size_in_bytes = int(response.headers.get("content-length", 0))
block_size = 1024 # 1 Kibibyte
progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
with open(to_local_path, "wb") as file:
for data in response.iter_content(block_size):
progress_bar.update(len(data))
file.write(data)
progress_bar.close()
if total_size_in_bytes not in [0, progress_bar.n]:
raise ValueError("ERROR, something went wrong")
class InferenceContext:
def __init__(self, benchmark=False):
# we can't expect inputs to be the same length, so disable benchmarking by default
self._chosen_cudnn_benchmark = benchmark
self._cudnn_benchmark = None
def __enter__(self):
self._cudnn_benchmark = torch.backends.cudnn.benchmark
torch.backends.cudnn.benchmark = self._chosen_cudnn_benchmark
def __exit__(self, exc_type, exc_value, exc_traceback):
torch.backends.cudnn.benchmark = self._cudnn_benchmark
if torch.cuda.is_available():
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
@contextlib.contextmanager
def inference_mode():
with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
yield
def clear_cuda_cache():
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
def load_model(ckpt_path, device, config, model_type="text"):
logger.info(f"loading {model_type} model from {ckpt_path}...")
if device == "cpu":
logger.warning("No GPU being used. Careful, Inference might be extremely slow!")
if model_type == "text":
ConfigClass = GPTConfig
ModelClass = GPT
elif model_type == "coarse":
ConfigClass = GPTConfig
ModelClass = GPT
elif model_type == "fine":
ConfigClass = FineGPTConfig
ModelClass = FineGPT
else:
raise NotImplementedError()
if (
not config.USE_SMALLER_MODELS
and os.path.exists(ckpt_path)
and _md5(ckpt_path) != config.REMOTE_MODEL_PATHS[model_type]["checksum"]
):
logger.warning(f"found outdated {model_type} model, removing...")
os.remove(ckpt_path)
if not os.path.exists(ckpt_path):
logger.info(f"{model_type} model not found, downloading...")
_download(config.REMOTE_MODEL_PATHS[model_type]["path"], ckpt_path, config.CACHE_DIR)
checkpoint = torch.load(ckpt_path, map_location=device)
# this is a hack
model_args = checkpoint["model_args"]
if "input_vocab_size" not in model_args:
model_args["input_vocab_size"] = model_args["vocab_size"]
model_args["output_vocab_size"] = model_args["vocab_size"]
del model_args["vocab_size"]
gptconf = ConfigClass(**checkpoint["model_args"])
if model_type == "text":
config.semantic_config = gptconf
elif model_type == "coarse":
config.coarse_config = gptconf
elif model_type == "fine":
config.fine_config = gptconf
model = ModelClass(gptconf)
state_dict = checkpoint["model"]
# fixup checkpoint
unwanted_prefix = "_orig_mod."
for k, _ in list(state_dict.items()):
if k.startswith(unwanted_prefix):
state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())
extra_keys = set(k for k in extra_keys if not k.endswith(".attn.bias"))
missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
missing_keys = set(k for k in missing_keys if not k.endswith(".attn.bias"))
if len(extra_keys) != 0:
raise ValueError(f"extra keys found: {extra_keys}")
if len(missing_keys) != 0:
raise ValueError(f"missing keys: {missing_keys}")
model.load_state_dict(state_dict, strict=False)
n_params = model.get_num_params()
val_loss = checkpoint["best_val_loss"].item()
logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss")
model.eval()
model.to(device)
del checkpoint, state_dict
clear_cuda_cache()
return model, config

View File

@ -0,0 +1,233 @@
"""
Much of this code is adapted from Andrej Karpathy's NanoGPT
(https://github.com/karpathy/nanoGPT)
"""
import math
from dataclasses import dataclass
import torch
from coqpit import Coqpit
from torch import nn
from torch.nn import functional as F
class LayerNorm(nn.Module):
"""LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""
def __init__(self, ndim, bias):
super().__init__()
self.weight = nn.Parameter(torch.ones(ndim))
self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
def forward(self, x):
return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)
class CausalSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
assert config.n_embd % config.n_head == 0
# key, query, value projections for all heads, but in a batch
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
# output projection
self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
# regularization
self.attn_dropout = nn.Dropout(config.dropout)
self.resid_dropout = nn.Dropout(config.dropout)
self.n_head = config.n_head
self.n_embd = config.n_embd
self.dropout = config.dropout
# flash attention makes GPU go brrrrr but support is only in PyTorch nightly and still a bit scary
self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention")
if not self.flash:
# print("WARNING: using slow attention. Flash Attention atm needs PyTorch nightly and dropout=0.0")
# causal mask to ensure that attention is only applied to the left in the input sequence
self.register_buffer(
"bias",
torch.tril(torch.ones(config.block_size, config.block_size)).view(
1, 1, config.block_size, config.block_size
),
)
def forward(self, x, past_kv=None, use_cache=False):
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
if past_kv is not None:
past_key = past_kv[0]
past_value = past_kv[1]
k = torch.cat((past_key, k), dim=-2)
v = torch.cat((past_value, v), dim=-2)
FULL_T = k.shape[-2]
if use_cache is True:
present = (k, v)
else:
present = None
# causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
if self.flash:
# efficient attention using Flash Attention CUDA kernels
if past_kv is not None:
# When `past_kv` is provided, we're doing incremental decoding and `q.shape[2] == 1`: q only contains
# the query for the last token. scaled_dot_product_attention interprets this as the first token in the
# sequence, so if is_causal=True it will mask out all attention from it. This is not what we want, so
# to work around this we set is_causal=False.
is_causal = False
else:
is_causal = True
# efficient attention using Flash Attention CUDA kernels
y = torch.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout, is_causal=is_causal)
else:
# manual implementation of attention
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
att = att.masked_fill(self.bias[:, :, FULL_T - T : FULL_T, :FULL_T] == 0, float("-inf"))
att = F.softmax(att, dim=-1)
att = self.attn_dropout(att)
y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
# output projection
y = self.resid_dropout(self.c_proj(y))
return (y, present)
class MLP(nn.Module):
def __init__(self, config):
super().__init__()
self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
self.dropout = nn.Dropout(config.dropout)
self.gelu = nn.GELU()
def forward(self, x):
x = self.c_fc(x)
x = self.gelu(x)
x = self.c_proj(x)
x = self.dropout(x)
return x
class Block(nn.Module):
def __init__(self, config, layer_idx):
super().__init__()
self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
self.attn = CausalSelfAttention(config)
self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
self.mlp = MLP(config)
self.layer_idx = layer_idx
def forward(self, x, past_kv=None, use_cache=False):
attn_output, prev_kvs = self.attn(self.ln_1(x), past_kv=past_kv, use_cache=use_cache)
x = x + attn_output
x = x + self.mlp(self.ln_2(x))
return (x, prev_kvs)
@dataclass
class GPTConfig(Coqpit):
block_size: int = 1024
input_vocab_size: int = 10_048
output_vocab_size: int = 10_048
n_layer: int = 12
n_head: int = 12
n_embd: int = 768
dropout: float = 0.0
bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
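# Shape sketch (hypothetical, tiny GPTConfig values) of the KV cache returned by
# CausalSelfAttention.forward() above during incremental decoding:
import torch

cfg = GPTConfig(n_layer=2, n_head=2, n_embd=8, block_size=16,
                input_vocab_size=32, output_vocab_size=32, dropout=0.0)
attn = CausalSelfAttention(cfg)
x = torch.randn(1, 5, cfg.n_embd)
y, (k, v) = attn(x, use_cache=True)                 # prime the cache with 5 tokens
print(k.shape)                                      # torch.Size([1, 2, 5, 4])
y, (k, v) = attn(torch.randn(1, 1, cfg.n_embd), past_kv=(k, v), use_cache=True)
print(k.shape)                                      # torch.Size([1, 2, 6, 4])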
class GPT(nn.Module):
def __init__(self, config):
super().__init__()
assert config.input_vocab_size is not None
assert config.output_vocab_size is not None
assert config.block_size is not None
self.config = config
self.transformer = nn.ModuleDict(
dict(
wte=nn.Embedding(config.input_vocab_size, config.n_embd),
wpe=nn.Embedding(config.block_size, config.n_embd),
drop=nn.Dropout(config.dropout),
h=nn.ModuleList([Block(config, idx) for idx in range(config.n_layer)]),
ln_f=LayerNorm(config.n_embd, bias=config.bias),
)
)
self.lm_head = nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
def get_num_params(self, non_embedding=True):
"""
Return the number of parameters in the model.
For non-embedding count (default), the position embeddings get subtracted.
The token embeddings would too, except due to the parameter sharing these
params are actually used as weights in the final layer, so we include them.
"""
n_params = sum(p.numel() for p in self.parameters())
if non_embedding:
n_params -= self.transformer.wte.weight.numel()
n_params -= self.transformer.wpe.weight.numel()
return n_params
def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use_cache=False):
device = idx.device
_, t = idx.size()
if past_kv is not None:
assert t == 1
tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
else:
if merge_context:
assert idx.shape[1] >= 256 + 256 + 1
t = idx.shape[1] - 256
else:
assert (
t <= self.config.block_size
), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
# forward the GPT model itself
if merge_context:
tok_emb = torch.cat(
[
self.transformer.wte(idx[:, :256]) + self.transformer.wte(idx[:, 256 : 256 + 256]),
self.transformer.wte(idx[:, 256 + 256 :]),
],
dim=1,
)
else:
tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
if past_kv is None:
past_length = 0
past_kv = tuple([None] * len(self.transformer.h))
else:
past_length = past_kv[0][0].size(-2)
if position_ids is None:
position_ids = torch.arange(past_length, t + past_length, dtype=torch.long, device=device)
position_ids = position_ids.unsqueeze(0) # shape (1, t)
assert position_ids.shape == (1, t)
pos_emb = self.transformer.wpe(position_ids) # position embeddings of shape (1, t, n_embd)
x = self.transformer.drop(tok_emb + pos_emb)
new_kv = () if use_cache else None
for _, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
x, kv = block(x, past_kv=past_layer_kv, use_cache=use_cache)
if use_cache:
new_kv = new_kv + (kv,)
x = self.transformer.ln_f(x)
# inference-time mini-optimization: only forward the lm_head on the very last position
logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
return (logits, new_kv)
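# Hypothetical shape check for GPT.forward() above with merge_context=True,
# mirroring how the semantic model is driven (256 text tokens + 256 history
# tokens + 1 infer token in, logits for the last position out). The config
# values here are deliberately tiny:
import torch

cfg = GPTConfig(n_layer=2, n_head=2, n_embd=8, block_size=1024,
                input_vocab_size=128, output_vocab_size=128, dropout=0.0)
gpt = GPT(cfg).eval()
idx = torch.randint(0, 128, (1, 256 + 256 + 1))
logits, kv = gpt(idx, merge_context=True, use_cache=True)
print(logits.shape)    # torch.Size([1, 1, 128]) - only the last position is returned
print(kv[0][0].shape)  # per-layer key cache: torch.Size([1, 2, 257, 4])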

View File

@ -0,0 +1,142 @@
"""
Much of this code is adapted from Andrej Karpathy's NanoGPT
(https://github.com/karpathy/nanoGPT)
"""
import math
from dataclasses import dataclass
import torch
from torch import nn
from torch.nn import functional as F
from .model import GPT, MLP, GPTConfig
class NonCausalSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
assert config.n_embd % config.n_head == 0
# key, query, value projections for all heads, but in a batch
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
# output projection
self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
# regularization
self.attn_dropout = nn.Dropout(config.dropout)
self.resid_dropout = nn.Dropout(config.dropout)
self.n_head = config.n_head
self.n_embd = config.n_embd
self.dropout = config.dropout
# flash attention makes GPU go brrrrr but support is only in PyTorch nightly and still a bit scary
self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention") and self.dropout == 0.0
def forward(self, x):
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
# causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
if self.flash:
# efficient attention using Flash Attention CUDA kernels
y = torch.nn.functional.scaled_dot_product_attention(
q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=False
)
else:
# manual implementation of attention
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
att = F.softmax(att, dim=-1)
att = self.attn_dropout(att)
y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
# output projection
y = self.resid_dropout(self.c_proj(y))
return y
class FineBlock(nn.Module):
def __init__(self, config):
super().__init__()
self.ln_1 = nn.LayerNorm(config.n_embd)
self.attn = NonCausalSelfAttention(config)
self.ln_2 = nn.LayerNorm(config.n_embd)
self.mlp = MLP(config)
def forward(self, x):
x = x + self.attn(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x
class FineGPT(GPT):
def __init__(self, config):
super().__init__(config)
del self.lm_head
self.config = config
self.n_codes_total = config.n_codes_total
self.transformer = nn.ModuleDict(
dict(
wtes=nn.ModuleList(
[nn.Embedding(config.input_vocab_size, config.n_embd) for _ in range(config.n_codes_total)]
),
wpe=nn.Embedding(config.block_size, config.n_embd),
drop=nn.Dropout(config.dropout),
h=nn.ModuleList([FineBlock(config) for _ in range(config.n_layer)]),
ln_f=nn.LayerNorm(config.n_embd),
)
)
self.lm_heads = nn.ModuleList(
[
nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
for _ in range(config.n_codes_given, self.n_codes_total)
]
)
for i in range(self.n_codes_total - config.n_codes_given):
self.transformer.wtes[i + 1].weight = self.lm_heads[i].weight
def forward(self, pred_idx, idx):
device = idx.device
b, t, codes = idx.size()
assert (
t <= self.config.block_size
), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
assert pred_idx > 0, "cannot predict 0th codebook"
assert codes == self.n_codes_total, (b, t, codes)
pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)
# forward the GPT model itself
tok_embs = [
wte(idx[:, :, i]).unsqueeze(-1) for i, wte in enumerate(self.transformer.wtes)
] # token embeddings of shape (b, t, n_embd)
tok_emb = torch.cat(tok_embs, dim=-1)
pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
x = tok_emb[:, :, :, : pred_idx + 1].sum(dim=-1)
x = self.transformer.drop(x + pos_emb)
for block in self.transformer.h:
x = block(x)
x = self.transformer.ln_f(x)
logits = self.lm_heads[pred_idx - self.config.n_codes_given](x)
return logits
def get_num_params(self, non_embedding=True):
"""
Return the number of parameters in the model.
For non-embedding count (default), the position embeddings get subtracted.
The token embeddings would too, except due to the parameter sharing these
params are actually used as weights in the final layer, so we include them.
"""
n_params = sum(p.numel() for p in self.parameters())
if non_embedding:
for wte in self.transformer.wtes:
n_params -= wte.weight.numel()
n_params -= self.transformer.wpe.weight.numel()
return n_params
@dataclass
class FineGPTConfig(GPTConfig):
n_codes_total: int = 8
n_codes_given: int = 1
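# Hypothetical shape check for FineGPT.forward() above: predict codebook
# `pred_idx` from all n_codes_total codebooks of a 1024-frame window (tiny,
# made-up config values):
import torch

cfg = FineGPTConfig(n_layer=1, n_head=2, n_embd=8, block_size=1024,
                    input_vocab_size=1056, output_vocab_size=1056,
                    dropout=0.0, n_codes_total=8, n_codes_given=1)
fine = FineGPT(cfg).eval()
idx = torch.randint(0, 1056, (1, 1024, 8))  # (batch, time, codebooks)
logits = fine(pred_idx=2, idx=idx)
print(logits.shape)                         # torch.Size([1, 1024, 1056])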

View File

@ -0,0 +1,563 @@
### credit: https://github.com/dunky11/voicesmith
from typing import Callable, Dict, Tuple
import torch
import torch.nn.functional as F
from coqpit import Coqpit
from torch import nn
from TTS.tts.layers.delightful_tts.conformer import Conformer
from TTS.tts.layers.delightful_tts.encoders import (
PhonemeLevelProsodyEncoder,
UtteranceLevelProsodyEncoder,
get_mask_from_lengths,
)
from TTS.tts.layers.delightful_tts.energy_adaptor import EnergyAdaptor
from TTS.tts.layers.delightful_tts.networks import EmbeddingPadded, positional_encoding
from TTS.tts.layers.delightful_tts.phoneme_prosody_predictor import PhonemeProsodyPredictor
from TTS.tts.layers.delightful_tts.pitch_adaptor import PitchAdaptor
from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
from TTS.tts.layers.generic.aligner import AlignmentNetwork
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
class AcousticModel(torch.nn.Module):
def __init__(
self,
args: "ModelArgs",
tokenizer: "TTSTokenizer" = None,
speaker_manager: "SpeakerManager" = None,
):
super().__init__()
self.args = args
self.tokenizer = tokenizer
self.speaker_manager = speaker_manager
self.init_multispeaker(args)
# self.set_embedding_dims()
self.length_scale = (
float(self.args.length_scale) if isinstance(self.args.length_scale, int) else self.args.length_scale
)
self.emb_dim = args.n_hidden_conformer_encoder
self.encoder = Conformer(
dim=self.args.n_hidden_conformer_encoder,
n_layers=self.args.n_layers_conformer_encoder,
n_heads=self.args.n_heads_conformer_encoder,
speaker_embedding_dim=self.embedded_speaker_dim,
p_dropout=self.args.dropout_conformer_encoder,
kernel_size_conv_mod=self.args.kernel_size_conv_mod_conformer_encoder,
lrelu_slope=self.args.lrelu_slope,
)
self.pitch_adaptor = PitchAdaptor(
n_input=self.args.n_hidden_conformer_encoder,
n_hidden=self.args.n_hidden_variance_adaptor,
n_out=1,
kernel_size=self.args.kernel_size_variance_adaptor,
emb_kernel_size=self.args.emb_kernel_size_variance_adaptor,
p_dropout=self.args.dropout_variance_adaptor,
lrelu_slope=self.args.lrelu_slope,
)
self.energy_adaptor = EnergyAdaptor(
channels_in=self.args.n_hidden_conformer_encoder,
channels_hidden=self.args.n_hidden_variance_adaptor,
channels_out=1,
kernel_size=self.args.kernel_size_variance_adaptor,
emb_kernel_size=self.args.emb_kernel_size_variance_adaptor,
dropout=self.args.dropout_variance_adaptor,
lrelu_slope=self.args.lrelu_slope,
)
self.aligner = AlignmentNetwork(
in_query_channels=self.args.out_channels,
in_key_channels=self.args.n_hidden_conformer_encoder,
)
self.duration_predictor = VariancePredictor(
channels_in=self.args.n_hidden_conformer_encoder,
channels=self.args.n_hidden_variance_adaptor,
channels_out=1,
kernel_size=self.args.kernel_size_variance_adaptor,
p_dropout=self.args.dropout_variance_adaptor,
lrelu_slope=self.args.lrelu_slope,
)
self.utterance_prosody_encoder = UtteranceLevelProsodyEncoder(
num_mels=self.args.num_mels,
ref_enc_filters=self.args.ref_enc_filters_reference_encoder,
ref_enc_size=self.args.ref_enc_size_reference_encoder,
ref_enc_gru_size=self.args.ref_enc_gru_size_reference_encoder,
ref_enc_strides=self.args.ref_enc_strides_reference_encoder,
n_hidden=self.args.n_hidden_conformer_encoder,
dropout=self.args.dropout_conformer_encoder,
bottleneck_size_u=self.args.bottleneck_size_u_reference_encoder,
token_num=self.args.token_num_reference_encoder,
)
self.utterance_prosody_predictor = PhonemeProsodyPredictor(
hidden_size=self.args.n_hidden_conformer_encoder,
kernel_size=self.args.predictor_kernel_size_reference_encoder,
dropout=self.args.dropout_conformer_encoder,
bottleneck_size=self.args.bottleneck_size_u_reference_encoder,
lrelu_slope=self.args.lrelu_slope,
)
self.phoneme_prosody_encoder = PhonemeLevelProsodyEncoder(
num_mels=self.args.num_mels,
ref_enc_filters=self.args.ref_enc_filters_reference_encoder,
ref_enc_size=self.args.ref_enc_size_reference_encoder,
ref_enc_gru_size=self.args.ref_enc_gru_size_reference_encoder,
ref_enc_strides=self.args.ref_enc_strides_reference_encoder,
n_hidden=self.args.n_hidden_conformer_encoder,
dropout=self.args.dropout_conformer_encoder,
bottleneck_size_p=self.args.bottleneck_size_p_reference_encoder,
n_heads=self.args.n_heads_conformer_encoder,
)
self.phoneme_prosody_predictor = PhonemeProsodyPredictor(
hidden_size=self.args.n_hidden_conformer_encoder,
kernel_size=self.args.predictor_kernel_size_reference_encoder,
dropout=self.args.dropout_conformer_encoder,
bottleneck_size=self.args.bottleneck_size_p_reference_encoder,
lrelu_slope=self.args.lrelu_slope,
)
self.u_bottle_out = nn.Linear(
self.args.bottleneck_size_u_reference_encoder,
self.args.n_hidden_conformer_encoder,
)
self.u_norm = nn.InstanceNorm1d(self.args.bottleneck_size_u_reference_encoder)
self.p_bottle_out = nn.Linear(
self.args.bottleneck_size_p_reference_encoder,
self.args.n_hidden_conformer_encoder,
)
self.p_norm = nn.InstanceNorm1d(
self.args.bottleneck_size_p_reference_encoder,
)
self.decoder = Conformer(
dim=self.args.n_hidden_conformer_decoder,
n_layers=self.args.n_layers_conformer_decoder,
n_heads=self.args.n_heads_conformer_decoder,
speaker_embedding_dim=self.embedded_speaker_dim,
p_dropout=self.args.dropout_conformer_decoder,
kernel_size_conv_mod=self.args.kernel_size_conv_mod_conformer_decoder,
lrelu_slope=self.args.lrelu_slope,
)
padding_idx = self.tokenizer.characters.pad_id
self.src_word_emb = EmbeddingPadded(
self.args.num_chars, self.args.n_hidden_conformer_encoder, padding_idx=padding_idx
)
self.to_mel = nn.Linear(
self.args.n_hidden_conformer_decoder,
self.args.num_mels,
)
self.energy_scaler = torch.nn.BatchNorm1d(1, affine=False, track_running_stats=True, momentum=None)
self.energy_scaler.requires_grad_(False)
def init_multispeaker(self, args: Coqpit): # pylint: disable=unused-argument
"""Init for multi-speaker training."""
self.embedded_speaker_dim = 0
self.num_speakers = self.args.num_speakers
self.audio_transform = None
if self.speaker_manager:
self.num_speakers = self.speaker_manager.num_speakers
if self.args.use_speaker_embedding:
self._init_speaker_embedding()
if self.args.use_d_vector_file:
self._init_d_vector()
@staticmethod
def _set_cond_input(aux_input: Dict):
"""Set the speaker conditioning input based on the multi-speaker mode."""
sid, g, lid, durations = None, None, None, None
if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None:
sid = aux_input["speaker_ids"]
if sid.ndim == 0:
sid = sid.unsqueeze_(0)
if "d_vectors" in aux_input and aux_input["d_vectors"] is not None:
g = F.normalize(aux_input["d_vectors"]) # .unsqueeze_(-1)
if g.ndim == 2:
g = g # .unsqueeze_(0) # pylint: disable=self-assigning-variable
if "durations" in aux_input and aux_input["durations"] is not None:
durations = aux_input["durations"]
return sid, g, lid, durations
def get_aux_input(self, aux_input: Dict):
sid, g, lid, _ = self._set_cond_input(aux_input)
return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid}
def _set_speaker_input(self, aux_input: Dict):
d_vectors = aux_input.get("d_vectors", None)
speaker_ids = aux_input.get("speaker_ids", None)
if d_vectors is not None and speaker_ids is not None:
raise ValueError("[!] Cannot use d-vectors and speaker-ids together.")
if speaker_ids is not None and not hasattr(self, "emb_g"):
raise ValueError("[!] Cannot use speaker-ids without enabling speaker embedding.")
g = speaker_ids if speaker_ids is not None else d_vectors
return g
# def set_embedding_dims(self):
# if self.embedded_speaker_dim > 0:
# self.embedding_dims = self.embedded_speaker_dim
# else:
# self.embedding_dims = 0
def _init_speaker_embedding(self):
# pylint: disable=attribute-defined-outside-init
if self.num_speakers > 0:
print(" > initialization of speaker-embedding layers.")
self.embedded_speaker_dim = self.args.speaker_embedding_channels
self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
def _init_d_vector(self):
# pylint: disable=attribute-defined-outside-init
if hasattr(self, "emb_g"):
raise ValueError("[!] Speaker embedding layer already initialized before d_vector settings.")
self.embedded_speaker_dim = self.args.d_vector_dim
@staticmethod
def generate_attn(dr, x_mask, y_mask=None):
"""Generate an attention mask from the linear scale durations.
Args:
dr (Tensor): Linear scale durations.
x_mask (Tensor): Mask for the input (character) sequence.
y_mask (Tensor): Mask for the output (spectrogram) sequence. Compute it from the predicted durations
if None. Defaults to None.
Shapes
- dr: :math:`(B, T_{en})`
- x_mask: :math:`(B, T_{en})`
- y_mask: :math:`(B, T_{de})`
"""
# compute decode mask from the durations
if y_mask is None:
y_lengths = dr.sum(1).long()
y_lengths[y_lengths < 1] = 1
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype)
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype)
return attn
def _expand_encoder_with_durations(
self,
o_en: torch.FloatTensor,
dr: torch.IntTensor,
x_mask: torch.IntTensor,
y_lengths: torch.IntTensor,
):
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype)
attn = self.generate_attn(dr, x_mask, y_mask)
o_en_ex = torch.einsum("kmn, kjm -> kjn", [attn.float(), o_en])
return y_mask, o_en_ex, attn.transpose(1, 2)
def _forward_aligner(
self,
x: torch.FloatTensor,
y: torch.FloatTensor,
x_mask: torch.IntTensor,
y_mask: torch.IntTensor,
attn_priors: torch.FloatTensor,
) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
"""Aligner forward pass.
1. Compute a mask to apply to the attention map.
2. Run the alignment network.
3. Apply MAS to compute the hard alignment map.
4. Compute the durations from the hard alignment map.
Args:
x (torch.FloatTensor): Input sequence.
y (torch.FloatTensor): Output sequence.
x_mask (torch.IntTensor): Input sequence mask.
y_mask (torch.IntTensor): Output sequence mask.
attn_priors (torch.FloatTensor): Prior for the aligner network map.
Returns:
Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
Durations from the hard alignment map, soft alignment potentials, log scale alignment potentials,
hard alignment map.
Shapes:
- x: :math:`[B, T_en, C_en]`
- y: :math:`[B, T_de, C_de]`
- x_mask: :math:`[B, 1, T_en]`
- y_mask: :math:`[B, 1, T_de]`
- attn_priors: :math:`[B, T_de, T_en]`
- aligner_durations: :math:`[B, T_en]`
- aligner_soft: :math:`[B, T_de, T_en]`
- aligner_logprob: :math:`[B, 1, T_de, T_en]`
- aligner_mas: :math:`[B, T_de, T_en]`
"""
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) # [B, 1, T_en, T_de]
aligner_soft, aligner_logprob = self.aligner(y.transpose(1, 2), x.transpose(1, 2), x_mask, attn_priors)
aligner_mas = maximum_path(
aligner_soft.squeeze(1).transpose(1, 2).contiguous(), attn_mask.squeeze(1).contiguous()
)
aligner_durations = torch.sum(aligner_mas, -1).int()
aligner_soft = aligner_soft.squeeze(1) # [B, T_max2, T_max]
aligner_mas = aligner_mas.transpose(1, 2) # [B, T_max, T_max2] -> [B, T_max2, T_max]
return aligner_durations, aligner_soft, aligner_logprob, aligner_mas
def average_utterance_prosody( # pylint: disable=no-self-use
self, u_prosody_pred: torch.Tensor, src_mask: torch.Tensor
) -> torch.Tensor:
lengths = ((~src_mask) * 1.0).sum(1)
u_prosody_pred = u_prosody_pred.sum(1, keepdim=True) / lengths.view(-1, 1, 1)
return u_prosody_pred
def forward(
self,
tokens: torch.Tensor,
src_lens: torch.Tensor,
mels: torch.Tensor,
mel_lens: torch.Tensor,
pitches: torch.Tensor,
energies: torch.Tensor,
attn_priors: torch.Tensor,
use_ground_truth: bool = True,
d_vectors: torch.Tensor = None,
speaker_idx: torch.Tensor = None,
) -> Dict[str, torch.Tensor]:
sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable
{"d_vectors": d_vectors, "speaker_ids": speaker_idx}
) # pylint: disable=unused-variable
src_mask = get_mask_from_lengths(src_lens) # [B, T_src]
mel_mask = get_mask_from_lengths(mel_lens) # [B, T_mel]
# Token embeddings
token_embeddings = self.src_word_emb(tokens) # [B, T_src, C_hidden]
token_embeddings = token_embeddings.masked_fill(src_mask.unsqueeze(-1), 0.0)
# Alignment network and durations
aligner_durations, aligner_soft, aligner_logprob, aligner_mas = self._forward_aligner(
x=token_embeddings,
y=mels.transpose(1, 2),
x_mask=~src_mask[:, None],
y_mask=~mel_mask[:, None],
attn_priors=attn_priors,
)
dr = aligner_durations # [B, T_en]
# Embeddings
speaker_embedding = None
if d_vectors is not None:
speaker_embedding = g
elif speaker_idx is not None:
speaker_embedding = F.normalize(self.emb_g(sid))
pos_encoding = positional_encoding(
self.emb_dim,
max(token_embeddings.shape[1], max(mel_lens)),
device=token_embeddings.device,
)
encoder_outputs = self.encoder(
token_embeddings,
src_mask,
speaker_embedding=speaker_embedding,
encoding=pos_encoding,
)
u_prosody_ref = self.u_norm(self.utterance_prosody_encoder(mels=mels, mel_lens=mel_lens))
u_prosody_pred = self.u_norm(
self.average_utterance_prosody(
u_prosody_pred=self.utterance_prosody_predictor(x=encoder_outputs, mask=src_mask),
src_mask=src_mask,
)
)
if use_ground_truth:
encoder_outputs = encoder_outputs + self.u_bottle_out(u_prosody_ref)
else:
encoder_outputs = encoder_outputs + self.u_bottle_out(u_prosody_pred)
p_prosody_ref = self.p_norm(
self.phoneme_prosody_encoder(
x=encoder_outputs, src_mask=src_mask, mels=mels, mel_lens=mel_lens, encoding=pos_encoding
)
)
p_prosody_pred = self.p_norm(self.phoneme_prosody_predictor(x=encoder_outputs, mask=src_mask))
if use_ground_truth:
encoder_outputs = encoder_outputs + self.p_bottle_out(p_prosody_ref)
else:
encoder_outputs = encoder_outputs + self.p_bottle_out(p_prosody_pred)
encoder_outputs_res = encoder_outputs
pitch_pred, avg_pitch_target, pitch_emb = self.pitch_adaptor.get_pitch_embedding_train(
x=encoder_outputs,
target=pitches,
dr=dr,
mask=src_mask,
)
energy_pred, avg_energy_target, energy_emb = self.energy_adaptor.get_energy_embedding_train(
x=encoder_outputs,
target=energies,
dr=dr,
mask=src_mask,
)
encoder_outputs = encoder_outputs.transpose(1, 2) + pitch_emb + energy_emb
log_duration_prediction = self.duration_predictor(x=encoder_outputs_res.detach(), mask=src_mask)
mel_pred_mask, encoder_outputs_ex, alignments = self._expand_encoder_with_durations(
o_en=encoder_outputs, y_lengths=mel_lens, dr=dr, x_mask=~src_mask[:, None]
)
x = self.decoder(
encoder_outputs_ex.transpose(1, 2),
mel_mask,
speaker_embedding=speaker_embedding,
encoding=pos_encoding,
)
x = self.to_mel(x)
dr = torch.log(dr + 1)
dr_pred = torch.exp(log_duration_prediction) - 1
alignments_dp = self.generate_attn(dr_pred, src_mask.unsqueeze(1), mel_pred_mask) # [B, T_max, T_max2']
return {
"model_outputs": x,
"pitch_pred": pitch_pred,
"pitch_target": avg_pitch_target,
"energy_pred": energy_pred,
"energy_target": avg_energy_target,
"u_prosody_pred": u_prosody_pred,
"u_prosody_ref": u_prosody_ref,
"p_prosody_pred": p_prosody_pred,
"p_prosody_ref": p_prosody_ref,
"alignments_dp": alignments_dp,
"alignments": alignments, # [B, T_de, T_en]
"aligner_soft": aligner_soft,
"aligner_mas": aligner_mas,
"aligner_durations": aligner_durations,
"aligner_logprob": aligner_logprob,
"dr_log_pred": log_duration_prediction.squeeze(1), # [B, T]
"dr_log_target": dr.squeeze(1), # [B, T]
"spk_emb": speaker_embedding,
}
@torch.no_grad()
def inference(
self,
tokens: torch.Tensor,
speaker_idx: torch.Tensor,
p_control: float = None, # TODO # pylint: disable=unused-argument
d_control: float = None, # TODO # pylint: disable=unused-argument
d_vectors: torch.Tensor = None,
pitch_transform: Callable = None,
energy_transform: Callable = None,
) -> torch.Tensor:
src_mask = get_mask_from_lengths(torch.tensor([tokens.shape[1]], dtype=torch.int64, device=tokens.device))
src_lens = torch.tensor(tokens.shape[1:2]).to(tokens.device) # pylint: disable=unused-variable
sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable
{"d_vectors": d_vectors, "speaker_ids": speaker_idx}
) # pylint: disable=unused-variable
token_embeddings = self.src_word_emb(tokens)
token_embeddings = token_embeddings.masked_fill(src_mask.unsqueeze(-1), 0.0)
# Embeddings
speaker_embedding = None
if d_vectors is not None:
speaker_embedding = g
elif speaker_idx is not None:
speaker_embedding = F.normalize(self.emb_g(sid))
pos_encoding = positional_encoding(
self.emb_dim,
token_embeddings.shape[1],
device=token_embeddings.device,
)
encoder_outputs = self.encoder(
token_embeddings,
src_mask,
speaker_embedding=speaker_embedding,
encoding=pos_encoding,
)
u_prosody_pred = self.u_norm(
self.average_utterance_prosody(
u_prosody_pred=self.utterance_prosody_predictor(x=encoder_outputs, mask=src_mask),
src_mask=src_mask,
)
)
encoder_outputs = encoder_outputs + self.u_bottle_out(u_prosody_pred).expand_as(encoder_outputs)
p_prosody_pred = self.p_norm(
self.phoneme_prosody_predictor(
x=encoder_outputs,
mask=src_mask,
)
)
encoder_outputs = encoder_outputs + self.p_bottle_out(p_prosody_pred).expand_as(encoder_outputs)
encoder_outputs_res = encoder_outputs
pitch_emb_pred, pitch_pred = self.pitch_adaptor.get_pitch_embedding(
x=encoder_outputs,
mask=src_mask,
pitch_transform=pitch_transform,
pitch_mean=self.pitch_mean if hasattr(self, "pitch_mean") else None,
pitch_std=self.pitch_std if hasattr(self, "pitch_std") else None,
)
energy_emb_pred, energy_pred = self.energy_adaptor.get_energy_embedding(
x=encoder_outputs, mask=src_mask, energy_transform=energy_transform
)
encoder_outputs = encoder_outputs.transpose(1, 2) + pitch_emb_pred + energy_emb_pred
log_duration_pred = self.duration_predictor(
x=encoder_outputs_res.detach(), mask=src_mask
) # [B, C_hidden, T_src] -> [B, T_src]
duration_pred = (torch.exp(log_duration_pred) - 1) * (~src_mask) * self.length_scale # -> [B, T_src]
duration_pred[duration_pred < 1] = 1.0 # -> [B, T_src]
duration_pred = torch.round(duration_pred) # -> [B, T_src]
mel_lens = duration_pred.sum(1) # -> [B,]
_, encoder_outputs_ex, alignments = self._expand_encoder_with_durations(
o_en=encoder_outputs, y_lengths=mel_lens, dr=duration_pred.squeeze(1), x_mask=~src_mask[:, None]
)
mel_mask = get_mask_from_lengths(
torch.tensor([encoder_outputs_ex.shape[2]], dtype=torch.int64, device=encoder_outputs_ex.device)
)
# default to the precomputed positional encoding; recompute only if the expanded sequence is longer
encoding = pos_encoding
if encoder_outputs_ex.shape[2] > pos_encoding.shape[1]:
encoding = positional_encoding(self.emb_dim, encoder_outputs_ex.shape[2], device=tokens.device)
# [B, C_hidden, T_src], [B, 1, T_src], [B, C_emb], [B, T_src, C_hidden] -> [B, C_hidden, T_src]
x = self.decoder(
encoder_outputs_ex.transpose(1, 2),
mel_mask,
speaker_embedding=speaker_embedding,
encoding=encoding,
)
x = self.to_mel(x)
outputs = {
"model_outputs": x,
"alignments": alignments,
# "pitch": pitch_emb_pred,
"durations": duration_pred,
"pitch": pitch_pred,
"energy": energy_pred,
"spk_emb": speaker_embedding,
}
return outputs
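# Illustrative call pattern for `inference()` above (a hedged sketch, not part of the library code):
# it assumes `model` is an initialized instance of this acoustic model with a fitted tokenizer, that
# token IDs are produced upstream as a [B, T_src] LongTensor, and that either `speaker_idx` or
# `d_vectors` is supplied when the model is multi-speaker.
#
#     token_ids = torch.LongTensor(tokenizer.text_to_ids("hello world")).unsqueeze(0)  # [1, T_src]
#     outputs = model.inference(tokens=token_ids, speaker_idx=torch.LongTensor([0]), d_vectors=None)
#     mel = outputs["model_outputs"]      # [1, T_mel, num_mels] predicted mel-spectrogram
#     durations = outputs["durations"]    # [1, T_src] per-token frame counts (after length_scale)
#     alignments = outputs["alignments"]  # [1, T_mel, T_src] hard alignment built from the durations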

View File

@@ -0,0 +1,450 @@
### credit: https://github.com/dunky11/voicesmith
import math
from typing import Tuple
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
import torch.nn.functional as F
from TTS.tts.layers.delightful_tts.conv_layers import Conv1dGLU, DepthWiseConv1d, PointwiseConv1d
from TTS.tts.layers.delightful_tts.networks import GLUActivation
def calc_same_padding(kernel_size: int) -> Tuple[int, int]:
pad = kernel_size // 2
return (pad, pad - (kernel_size + 1) % 2)
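# Quick sanity check of the padding rule above (illustrative only): for an odd kernel the two
# pads are equal, for an even kernel the right pad is one smaller, so a stride-1 convolution
# keeps the sequence length unchanged.
#     calc_same_padding(7)  # -> (3, 3)
#     calc_same_padding(4)  # -> (2, 1)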
class Conformer(nn.Module):
def __init__(
self,
dim: int,
n_layers: int,
n_heads: int,
speaker_embedding_dim: int,
p_dropout: float,
kernel_size_conv_mod: int,
lrelu_slope: float,
):
"""
A Transformer variant that integrates both CNN and Transformer components.
Conformer proposes a novel combination of self-attention and convolution, in which self-attention
learns the global interaction while the convolutions efficiently capture the local correlations.
Args:
dim (int): Number of the dimensions for the model.
n_layers (int): Number of model layers.
n_heads (int): The number of attention heads.
speaker_embedding_dim (int): Number of speaker embedding dimensions.
p_dropout (float): Probability of dropout.
kernel_size_conv_mod (int): Size of kernels for convolution modules.
Inputs: inputs, mask
- **inputs** (batch, time, dim): Tensor containing input vector
- **encoding** (batch, time, dim): Positional embedding tensor
- **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
Returns:
- **outputs** (batch, time, dim): Tensor produced by Conformer Encoder.
"""
super().__init__()
d_k = d_v = dim // n_heads
self.layer_stack = nn.ModuleList(
[
ConformerBlock(
dim,
n_heads,
d_k,
d_v,
kernel_size_conv_mod=kernel_size_conv_mod,
dropout=p_dropout,
speaker_embedding_dim=speaker_embedding_dim,
lrelu_slope=lrelu_slope,
)
for _ in range(n_layers)
]
)
def forward(
self,
x: torch.Tensor,
mask: torch.Tensor,
speaker_embedding: torch.Tensor,
encoding: torch.Tensor,
) -> torch.Tensor:
"""
Shapes:
- x: :math:`[B, T_src, C]`
- mask: :math: `[B, T_src]`
- speaker_embedding: :math: `[B, C]`
- encoding: :math: `[B, T_max2, C]`
"""
attn_mask = mask.view((mask.shape[0], 1, 1, mask.shape[1]))
for enc_layer in self.layer_stack:
x = enc_layer(
x,
mask=mask,
slf_attn_mask=attn_mask,
speaker_embedding=speaker_embedding,
encoding=encoding,
)
return x
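# Minimal usage sketch for the encoder above (illustrative, with assumed hyperparameters):
#
#     conformer = Conformer(dim=192, n_layers=2, n_heads=2, speaker_embedding_dim=64,
#                           p_dropout=0.1, kernel_size_conv_mod=7, lrelu_slope=0.3)
#     x = torch.randn(2, 50, 192)                  # [B, T_src, C] input features
#     mask = torch.zeros(2, 50, dtype=torch.bool)  # [B, T_src]; True marks padded positions
#     spk = torch.randn(2, 64)                     # [B, C_spk] speaker embedding
#     enc = positional_encoding(192, 50, device=x.device)  # [1, T_src, C], from networks.py
#     y = conformer(x, mask, speaker_embedding=spk, encoding=enc)  # -> [2, 50, 192]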
class ConformerBlock(torch.nn.Module):
def __init__(
self,
d_model: int,
n_head: int,
d_k: int, # pylint: disable=unused-argument
d_v: int, # pylint: disable=unused-argument
kernel_size_conv_mod: int,
speaker_embedding_dim: int,
dropout: float,
lrelu_slope: float = 0.3,
):
"""
A Conformer block is composed of four modules stacked together: a feed-forward module,
a self-attention module, a convolution module, and a second feed-forward module at the end.
The two feed-forward modules sandwich the multi-headed self-attention module and the
convolution module.
Args:
d_model (int): The dimension of model
n_head (int): The number of attention heads.
kernel_size_conv_mod (int): Size of kernels for convolution modules.
speaker_embedding_dim (int): Number of speaker embedding dimensions.
dropout (float): Probability of dropout.
Inputs: inputs, mask
- **inputs** (batch, time, dim): Tensor containing input vector
- **encoding** (batch, time, dim): Positional embedding tensor
- **slf_attn_mask** (batch, 1, 1, time1): Tensor containing indices to be masked in self attention module
- **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
Returns:
- **outputs** (batch, time, dim): Tensor produced by the Conformer Block.
"""
super().__init__()
if isinstance(speaker_embedding_dim, int):
self.conditioning = Conv1dGLU(
d_model=d_model,
kernel_size=kernel_size_conv_mod,
padding=kernel_size_conv_mod // 2,
embedding_dim=speaker_embedding_dim,
)
self.ff = FeedForward(d_model=d_model, dropout=dropout, kernel_size=3, lrelu_slope=lrelu_slope)
self.conformer_conv_1 = ConformerConvModule(
d_model, kernel_size=kernel_size_conv_mod, dropout=dropout, lrelu_slope=lrelu_slope
)
self.ln = nn.LayerNorm(d_model)
self.slf_attn = ConformerMultiHeadedSelfAttention(d_model=d_model, num_heads=n_head, dropout_p=dropout)
self.conformer_conv_2 = ConformerConvModule(
d_model, kernel_size=kernel_size_conv_mod, dropout=dropout, lrelu_slope=lrelu_slope
)
def forward(
self,
x: torch.Tensor,
speaker_embedding: torch.Tensor,
mask: torch.Tensor,
slf_attn_mask: torch.Tensor,
encoding: torch.Tensor,
) -> torch.Tensor:
"""
Shapes:
- x: :math:`[B, T_src, C]`
- mask: :math: `[B, T_src]`
- slf_attn_mask: :math: `[B, 1, 1, T_src]`
- speaker_embedding: :math: `[B, C]`
- encoding: :math: `[B, T_max2, C]`
"""
if speaker_embedding is not None:
x = self.conditioning(x, embeddings=speaker_embedding)
x = self.ff(x) + x
x = self.conformer_conv_1(x) + x
res = x
x = self.ln(x)
x, _ = self.slf_attn(query=x, key=x, value=x, mask=slf_attn_mask, encoding=encoding)
x = x + res
x = x.masked_fill(mask.unsqueeze(-1), 0)
x = self.conformer_conv_2(x) + x
return x
class FeedForward(nn.Module):
def __init__(
self,
d_model: int,
kernel_size: int,
dropout: float,
lrelu_slope: float,
expansion_factor: int = 4,
):
"""
Feed Forward module for conformer block.
Args:
d_model (int): The dimension of model.
kernel_size (int): Size of the kernels for conv layers.
dropout (float): probability of dropout.
expansion_factor (int): The factor by which to project the number of channels.
lrelu_slope (float): The negative slope factor for the leaky ReLU activation.
Inputs: inputs
- **inputs** (batch, time, dim): Tensor containing input vector
Returns:
- **outputs** (batch, time, dim): Tensor produced by the feed forward module.
"""
super().__init__()
self.dropout = nn.Dropout(dropout)
self.ln = nn.LayerNorm(d_model)
self.conv_1 = nn.Conv1d(
d_model,
d_model * expansion_factor,
kernel_size=kernel_size,
padding=kernel_size // 2,
)
self.act = nn.LeakyReLU(lrelu_slope)
self.conv_2 = nn.Conv1d(d_model * expansion_factor, d_model, kernel_size=1)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Shapes:
x: :math: `[B, T, C]`
"""
x = self.ln(x)
x = x.permute((0, 2, 1))
x = self.conv_1(x)
x = x.permute((0, 2, 1))
x = self.act(x)
x = self.dropout(x)
x = x.permute((0, 2, 1))
x = self.conv_2(x)
x = x.permute((0, 2, 1))
x = self.dropout(x)
# half-step residual scaling of the feed-forward branch, as in the Macaron-style FF of Conformer
x = 0.5 * x
return x
class ConformerConvModule(nn.Module):
def __init__(
self,
d_model: int,
expansion_factor: int = 2,
kernel_size: int = 7,
dropout: float = 0.1,
lrelu_slope: float = 0.3,
):
"""
Convolution module for the Conformer block. It starts with a gating mechanism:
a pointwise convolution followed by a gated linear unit (GLU). This is followed
by a single 1-D depthwise convolution layer. Group normalization is deployed just after the depthwise
convolution to help with training. It also contains an expansion factor to project the number of channels.
Args:
d_model (int): The dimension of model.
expansion_factor (int): The factor by which to project the number of channels.
kernel_size (int): Size of kernels for convolution modules.
dropout (float): Probability of dropout.
lrelu_slope (float): The slope coefficient for leaky relu activation.
Inputs: inputs
- **inputs** (batch, time, dim): Tensor containing input vector
Returns:
- **outputs** (batch, time, dim): Tensor produced by the conv module.
"""
super().__init__()
inner_dim = d_model * expansion_factor
self.ln_1 = nn.LayerNorm(d_model)
self.conv_1 = PointwiseConv1d(d_model, inner_dim * 2)
self.conv_act = GLUActivation(slope=lrelu_slope)
self.depthwise = DepthWiseConv1d(
inner_dim,
inner_dim,
kernel_size=kernel_size,
padding=calc_same_padding(kernel_size)[0],
)
self.ln_2 = nn.GroupNorm(1, inner_dim)
self.activation = nn.LeakyReLU(lrelu_slope)
self.conv_2 = PointwiseConv1d(inner_dim, d_model)
self.dropout = nn.Dropout(dropout)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Shapes:
x: :math: `[B, T, C]`
"""
x = self.ln_1(x)
x = x.permute(0, 2, 1)
x = self.conv_1(x)
x = self.conv_act(x)
x = self.depthwise(x)
x = self.ln_2(x)
x = self.activation(x)
x = self.conv_2(x)
x = x.permute(0, 2, 1)
x = self.dropout(x)
return x
class ConformerMultiHeadedSelfAttention(nn.Module):
"""
Conformer employs multi-headed self-attention (MHSA) while integrating an important technique from Transformer-XL,
the relative sinusoidal positional encoding scheme. The relative positional encoding allows the self-attention
module to generalize better to different input lengths, and the resulting encoder is more robust to the variance of
the utterance length. Conformer uses pre-norm residual units with dropout, which helps with training
and regularizing deeper models.
Args:
d_model (int): The dimension of model
num_heads (int): The number of attention heads.
dropout_p (float): probability of dropout
Inputs: inputs, mask
- **inputs** (batch, time, dim): Tensor containing input vector
- **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
Returns:
- **outputs** (batch, time, dim): Tensor produced by the relative multi-headed self-attention module.
"""
def __init__(self, d_model: int, num_heads: int, dropout_p: float):
super().__init__()
self.attention = RelativeMultiHeadAttention(d_model=d_model, num_heads=num_heads)
self.dropout = nn.Dropout(p=dropout_p)
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: torch.Tensor,
encoding: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
batch_size, seq_length, _ = key.size() # pylint: disable=unused-variable
encoding = encoding[:, : key.shape[1]]
encoding = encoding.repeat(batch_size, 1, 1)
outputs, attn = self.attention(query, key, value, pos_embedding=encoding, mask=mask)
outputs = self.dropout(outputs)
return outputs, attn
class RelativeMultiHeadAttention(nn.Module):
"""
Multi-head attention with relative positional encoding.
This concept was proposed in the "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
Args:
d_model (int): The dimension of model
num_heads (int): The number of attention heads.
Inputs: query, key, value, pos_embedding, mask
- **query** (batch, time, dim): Tensor containing query vector
- **key** (batch, time, dim): Tensor containing key vector
- **value** (batch, time, dim): Tensor containing value vector
- **pos_embedding** (batch, time, dim): Positional embedding tensor
- **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
Returns:
- **outputs**: Tensor produced by the relative multi-head attention module.
"""
def __init__(
self,
d_model: int = 512,
num_heads: int = 16,
):
super().__init__()
assert d_model % num_heads == 0, "d_model % num_heads should be zero."
self.d_model = d_model
self.d_head = int(d_model / num_heads)
self.num_heads = num_heads
self.sqrt_dim = math.sqrt(d_model)
self.query_proj = nn.Linear(d_model, d_model)
self.key_proj = nn.Linear(d_model, d_model, bias=False)
self.value_proj = nn.Linear(d_model, d_model, bias=False)
self.pos_proj = nn.Linear(d_model, d_model, bias=False)
self.u_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
self.v_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
torch.nn.init.xavier_uniform_(self.u_bias)
torch.nn.init.xavier_uniform_(self.v_bias)
self.out_proj = nn.Linear(d_model, d_model)
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
pos_embedding: torch.Tensor,
mask: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
batch_size = query.shape[0]
query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.d_head)
key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
value = self.value_proj(value).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
pos_embedding = self.pos_proj(pos_embedding).view(batch_size, -1, self.num_heads, self.d_head)
u_bias = self.u_bias.expand_as(query)
v_bias = self.v_bias.expand_as(query)
a = (query + u_bias).transpose(1, 2)
content_score = a @ key.transpose(2, 3)
b = (query + v_bias).transpose(1, 2)
pos_score = b @ pos_embedding.permute(0, 2, 3, 1)
pos_score = self._relative_shift(pos_score)
score = content_score + pos_score
score = score * (1.0 / self.sqrt_dim)
score.masked_fill_(mask, -1e9)
attn = F.softmax(score, -1)
context = (attn @ value).transpose(1, 2)
context = context.contiguous().view(batch_size, -1, self.d_model)
return self.out_proj(context), attn
def _relative_shift(self, pos_score: torch.Tensor) -> torch.Tensor: # pylint: disable=no-self-use
batch_size, num_heads, seq_length1, seq_length2 = pos_score.size()
zeros = torch.zeros((batch_size, num_heads, seq_length1, 1), device=pos_score.device)
padded_pos_score = torch.cat([zeros, pos_score], dim=-1)
padded_pos_score = padded_pos_score.view(batch_size, num_heads, seq_length2 + 1, seq_length1)
pos_score = padded_pos_score[:, :, 1:].view_as(pos_score)
return pos_score
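# Mechanics of `_relative_shift` on a tiny [1, 1, 3, 3] example (illustrative comment only):
# prepending a zero column and re-viewing the buffer shifts row i left by (T - 1 - i), e.g.
#     [[a0, b0, c0],          [[c0, 0,  a1],
#      [a1, b1, c1],   --->    [b1, c1, 0 ],
#      [a2, b2, c2]]           [a2, b2, c2]]
# (the trailing entries wrap in from the zero pad and the next row). This is the relative-shift
# trick from Transformer-XL: each query row reads its positional scores at offsets relative to
# its own position instead of at absolute positions.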
class MultiHeadAttention(nn.Module):
"""
input:
query --- [N, T_q, query_dim]
key --- [N, T_k, key_dim]
output:
out --- [N, T_q, num_units]
"""
def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int):
super().__init__()
self.num_units = num_units
self.num_heads = num_heads
self.key_dim = key_dim
self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
def forward(self, query: torch.Tensor, key: torch.Tensor) -> torch.Tensor:
querys = self.W_query(query) # [N, T_q, num_units]
keys = self.W_key(key) # [N, T_k, num_units]
values = self.W_value(key)
split_size = self.num_units // self.num_heads
querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0) # [h, N, T_q, num_units/h]
keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
# score = softmax(QK^T / (d_k ** 0.5))
scores = torch.matmul(querys, keys.transpose(2, 3)) # [h, N, T_q, T_k]
scores = scores / (self.key_dim**0.5)
scores = F.softmax(scores, dim=3)
# out = score * V
out = torch.matmul(scores, values) # [h, N, T_q, num_units/h]
out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
return out
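# Usage sketch for the generic multi-head attention above (illustrative, with assumed sizes;
# in this codebase it is typically driven by a style-token layer):
#
#     mha = MultiHeadAttention(query_dim=64, key_dim=128, num_units=128, num_heads=4)
#     q = torch.randn(8, 1, 64)    # [N, T_q, query_dim], e.g. one prosody query per utterance
#     k = torch.randn(8, 10, 128)  # [N, T_k, key_dim], e.g. 10 style tokens
#     out = mha(q, k)              # -> [8, 1, 128]  (num_units)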

View File

@@ -0,0 +1,671 @@
from typing import Tuple
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
import torch.nn.functional as F
from torch.nn.utils import parametrize
from TTS.tts.layers.delightful_tts.kernel_predictor import KernelPredictor
def calc_same_padding(kernel_size: int) -> Tuple[int, int]:
pad = kernel_size // 2
return (pad, pad - (kernel_size + 1) % 2)
class ConvNorm(nn.Module):
"""A 1-dimensional convolutional layer with optional weight normalization.
This layer wraps a 1D convolutional layer from PyTorch and applies
optional weight normalization. The layer can be used in a similar way to
the convolutional layers in PyTorch's `torch.nn` module.
Args:
in_channels (int): The number of channels in the input signal.
out_channels (int): The number of channels in the output signal.
kernel_size (int, optional): The size of the convolving kernel.
Defaults to 1.
stride (int, optional): The stride of the convolution. Defaults to 1.
padding (int, optional): Zero-padding added to both sides of the input.
If `None`, the padding will be calculated so that the output has
the same length as the input. Defaults to `None`.
dilation (int, optional): Spacing between kernel elements. Defaults to 1.
bias (bool, optional): If `True`, add bias after convolution. Defaults to `True`.
w_init_gain (str, optional): The weight initialization function to use.
Can be either 'linear' or 'relu'. Defaults to 'linear'.
use_weight_norm (bool, optional): If `True`, apply weight normalization
to the convolutional weights. Defaults to `False`.
Shapes:
- Input: :math:`[N, D, T]`
- Output: :math:`[N, out_dim, T]` where `out_dim` is the number of output dimensions.
"""
def __init__(
self,
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=None,
dilation=1,
bias=True,
w_init_gain="linear",
use_weight_norm=False,
):
super(ConvNorm, self).__init__() # pylint: disable=super-with-arguments
if padding is None:
assert kernel_size % 2 == 1
padding = int(dilation * (kernel_size - 1) / 2)
self.kernel_size = kernel_size
self.dilation = dilation
self.use_weight_norm = use_weight_norm
conv_fn = nn.Conv1d
self.conv = conv_fn(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias,
)
nn.init.xavier_uniform_(self.conv.weight, gain=nn.init.calculate_gain(w_init_gain))
if self.use_weight_norm:
self.conv = nn.utils.parametrizations.weight_norm(self.conv)
def forward(self, signal, mask=None):
conv_signal = self.conv(signal)
if mask is not None:
# always re-zero output if mask is
# available to match zero-padding
conv_signal = conv_signal * mask
return conv_signal
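# Usage sketch for ConvNorm (illustrative): with `padding=None` and an odd kernel, the default
# padding keeps the time dimension unchanged.
#
#     conv = ConvNorm(in_channels=80, out_channels=256, kernel_size=5)
#     y = conv(torch.randn(4, 80, 100))  # -> [4, 256, 100]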
class ConvLSTMLinear(nn.Module):
def __init__(
self,
in_dim,
out_dim,
n_layers=2,
n_channels=256,
kernel_size=3,
p_dropout=0.1,
lstm_type="bilstm",
use_linear=True,
):
super(ConvLSTMLinear, self).__init__() # pylint: disable=super-with-arguments
self.out_dim = out_dim
self.lstm_type = lstm_type
self.use_linear = use_linear
self.dropout = nn.Dropout(p=p_dropout)
convolutions = []
for i in range(n_layers):
conv_layer = ConvNorm(
in_dim if i == 0 else n_channels,
n_channels,
kernel_size=kernel_size,
stride=1,
padding=int((kernel_size - 1) / 2),
dilation=1,
w_init_gain="relu",
)
conv_layer = nn.utils.parametrizations.weight_norm(conv_layer.conv, name="weight")
convolutions.append(conv_layer)
self.convolutions = nn.ModuleList(convolutions)
if not self.use_linear:
n_channels = out_dim
if self.lstm_type != "":
use_bilstm = False
lstm_channels = n_channels
if self.lstm_type == "bilstm":
use_bilstm = True
lstm_channels = int(n_channels // 2)
self.bilstm = nn.LSTM(n_channels, lstm_channels, 1, batch_first=True, bidirectional=use_bilstm)
lstm_norm_fn_pntr = nn.utils.spectral_norm
self.bilstm = lstm_norm_fn_pntr(self.bilstm, "weight_hh_l0")
if self.lstm_type == "bilstm":
self.bilstm = lstm_norm_fn_pntr(self.bilstm, "weight_hh_l0_reverse")
if self.use_linear:
self.dense = nn.Linear(n_channels, out_dim)
def run_padded_sequence(self, context, lens):
context_embedded = []
for b_ind in range(context.size()[0]): # TODO: speed up
curr_context = context[b_ind : b_ind + 1, :, : lens[b_ind]].clone()
for conv in self.convolutions:
curr_context = self.dropout(F.relu(conv(curr_context)))
context_embedded.append(curr_context[0].transpose(0, 1))
context = nn.utils.rnn.pad_sequence(context_embedded, batch_first=True)
return context
def run_unsorted_inputs(self, fn, context, lens): # pylint: disable=no-self-use
lens_sorted, ids_sorted = torch.sort(lens, descending=True)
unsort_ids = [0] * lens.size(0)
for i in range(len(ids_sorted)): # pylint: disable=consider-using-enumerate
unsort_ids[ids_sorted[i]] = i
lens_sorted = lens_sorted.long().cpu()
context = context[ids_sorted]
context = nn.utils.rnn.pack_padded_sequence(context, lens_sorted, batch_first=True)
context = fn(context)[0]
context = nn.utils.rnn.pad_packed_sequence(context, batch_first=True)[0]
# map back to original indices
context = context[unsort_ids]
return context
def forward(self, context, lens):
if context.size()[0] > 1:
context = self.run_padded_sequence(context, lens)
# to B, D, T
context = context.transpose(1, 2)
else:
for conv in self.convolutions:
context = self.dropout(F.relu(conv(context)))
if self.lstm_type != "":
context = context.transpose(1, 2)
self.bilstm.flatten_parameters()
if lens is not None:
context = self.run_unsorted_inputs(self.bilstm, context, lens)
else:
context = self.bilstm(context)[0]
context = context.transpose(1, 2)
x_hat = context
if self.use_linear:
x_hat = self.dense(context.transpose(1, 2)).transpose(1, 2)
return x_hat
class DepthWiseConv1d(nn.Module):
def __init__(self, in_channels: int, out_channels: int, kernel_size: int, padding: int):
super().__init__()
self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, padding=padding, groups=in_channels)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.conv(x)
class PointwiseConv1d(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
stride: int = 1,
padding: int = 0,
bias: bool = True,
):
super().__init__()
self.conv = nn.Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=stride,
padding=padding,
bias=bias,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.conv(x)
class BSConv1d(nn.Module):
"""https://arxiv.org/pdf/2003.13549.pdf"""
def __init__(self, channels_in: int, channels_out: int, kernel_size: int, padding: int):
super().__init__()
self.pointwise = nn.Conv1d(channels_in, channels_out, kernel_size=1)
self.depthwise = nn.Conv1d(
channels_out,
channels_out,
kernel_size=kernel_size,
padding=padding,
groups=channels_out,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x1 = self.pointwise(x)
x2 = self.depthwise(x1)
return x2
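# BSConv1d factorises a dense convolution into a 1x1 pointwise step followed by a depthwise step
# (see the linked paper), cutting parameters roughly from C_in*C_out*K to C_in*C_out + C_out*K.
# A quick illustrative check:
#
#     bsconv = BSConv1d(channels_in=256, channels_out=256, kernel_size=5, padding=2)
#     n_params = sum(p.numel() for p in bsconv.parameters())
#     # pointwise: 256*256 + 256 bias; depthwise: 256*5 + 256 bias -> 67,328 total,
#     # versus 256*256*5 + 256 = 327,936 for a dense nn.Conv1d(256, 256, 5).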
class BSConv2d(nn.Module):
"""https://arxiv.org/pdf/2003.13549.pdf"""
def __init__(self, channels_in: int, channels_out: int, kernel_size: int, padding: int):
super().__init__()
self.pointwise = nn.Conv2d(channels_in, channels_out, kernel_size=1)
self.depthwise = nn.Conv2d(
channels_out,
channels_out,
kernel_size=kernel_size,
padding=padding,
groups=channels_out,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x1 = self.pointwise(x)
x2 = self.depthwise(x1)
return x2
class Conv1dGLU(nn.Module):
"""From DeepVoice 3"""
def __init__(self, d_model: int, kernel_size: int, padding: int, embedding_dim: int):
super().__init__()
self.conv = BSConv1d(d_model, 2 * d_model, kernel_size=kernel_size, padding=padding)
self.embedding_proj = nn.Linear(embedding_dim, d_model)
self.register_buffer("sqrt", torch.sqrt(torch.FloatTensor([0.5])).squeeze(0))
self.softsign = torch.nn.Softsign()
def forward(self, x: torch.Tensor, embeddings: torch.Tensor) -> torch.Tensor:
x = x.permute((0, 2, 1))
residual = x
x = self.conv(x)
splitdim = 1
a, b = x.split(x.size(splitdim) // 2, dim=splitdim)
embeddings = self.embedding_proj(embeddings).unsqueeze(2)
softsign = self.softsign(embeddings)
softsign = softsign.expand_as(a)
a = a + softsign
x = a * torch.sigmoid(b)
x = x + residual
x = x * self.sqrt
x = x.permute((0, 2, 1))
return x
class ConvTransposed(nn.Module):
"""
A 1-D convolution applied over a channel-last tensor.
This layer transposes its `[B, T, C]` input to channel-first, applies a blueprint-separable
convolution (BSConv1d), and transposes the result back to `[B, T, C]`.
Attributes:
in_channels (int): The number of channels in the input tensor.
out_channels (int): The number of channels in the output tensor.
kernel_size (int): The size of the convolutional kernel. Default: 1.
padding (int): The number of padding elements to add to the input tensor. Default: 0.
conv (BSConv1d): The 1D convolutional transpose layer.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int = 1,
padding: int = 0,
):
super().__init__()
self.conv = BSConv1d(
in_channels,
out_channels,
kernel_size=kernel_size,
padding=padding,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = x.contiguous().transpose(1, 2)
x = self.conv(x)
x = x.contiguous().transpose(1, 2)
return x
class DepthwiseConvModule(nn.Module):
def __init__(self, dim: int, kernel_size: int = 7, expansion: int = 4, lrelu_slope: float = 0.3):
super().__init__()
padding = calc_same_padding(kernel_size)
self.depthwise = nn.Conv1d(
dim,
dim * expansion,
kernel_size=kernel_size,
padding=padding[0],
groups=dim,
)
self.act = nn.LeakyReLU(lrelu_slope)
self.out = nn.Conv1d(dim * expansion, dim, 1, 1, 0)
self.ln = nn.LayerNorm(dim)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.ln(x)
x = x.permute((0, 2, 1))
x = self.depthwise(x)
x = self.act(x)
x = self.out(x)
x = x.permute((0, 2, 1))
return x
class AddCoords(nn.Module):
def __init__(self, rank: int, with_r: bool = False):
super().__init__()
self.rank = rank
self.with_r = with_r
def forward(self, x: torch.Tensor) -> torch.Tensor:
if self.rank == 1:
batch_size_shape, channel_in_shape, dim_x = x.shape # pylint: disable=unused-variable
xx_range = torch.arange(dim_x, dtype=torch.int32)
xx_channel = xx_range[None, None, :]
xx_channel = xx_channel.float() / (dim_x - 1)
xx_channel = xx_channel * 2 - 1
xx_channel = xx_channel.repeat(batch_size_shape, 1, 1)
xx_channel = xx_channel.to(x.device)
out = torch.cat([x, xx_channel], dim=1)
if self.with_r:
rr = torch.sqrt(torch.pow(xx_channel - 0.5, 2))
out = torch.cat([out, rr], dim=1)
elif self.rank == 2:
batch_size_shape, channel_in_shape, dim_y, dim_x = x.shape
xx_ones = torch.ones([1, 1, 1, dim_x], dtype=torch.int32)
yy_ones = torch.ones([1, 1, 1, dim_y], dtype=torch.int32)
xx_range = torch.arange(dim_y, dtype=torch.int32)
yy_range = torch.arange(dim_x, dtype=torch.int32)
xx_range = xx_range[None, None, :, None]
yy_range = yy_range[None, None, :, None]
xx_channel = torch.matmul(xx_range, xx_ones)
yy_channel = torch.matmul(yy_range, yy_ones)
# transpose y
yy_channel = yy_channel.permute(0, 1, 3, 2)
xx_channel = xx_channel.float() / (dim_y - 1)
yy_channel = yy_channel.float() / (dim_x - 1)
xx_channel = xx_channel * 2 - 1
yy_channel = yy_channel * 2 - 1
xx_channel = xx_channel.repeat(batch_size_shape, 1, 1, 1)
yy_channel = yy_channel.repeat(batch_size_shape, 1, 1, 1)
xx_channel = xx_channel.to(x.device)
yy_channel = yy_channel.to(x.device)
out = torch.cat([x, xx_channel, yy_channel], dim=1)
if self.with_r:
rr = torch.sqrt(torch.pow(xx_channel - 0.5, 2) + torch.pow(yy_channel - 0.5, 2))
out = torch.cat([out, rr], dim=1)
elif self.rank == 3:
batch_size_shape, channel_in_shape, dim_z, dim_y, dim_x = x.shape
xx_ones = torch.ones([1, 1, 1, 1, dim_x], dtype=torch.int32)
yy_ones = torch.ones([1, 1, 1, 1, dim_y], dtype=torch.int32)
zz_ones = torch.ones([1, 1, 1, 1, dim_z], dtype=torch.int32)
xy_range = torch.arange(dim_y, dtype=torch.int32)
xy_range = xy_range[None, None, None, :, None]
yz_range = torch.arange(dim_z, dtype=torch.int32)
yz_range = yz_range[None, None, None, :, None]
zx_range = torch.arange(dim_x, dtype=torch.int32)
zx_range = zx_range[None, None, None, :, None]
xy_channel = torch.matmul(xy_range, xx_ones)
xx_channel = torch.cat([xy_channel + i for i in range(dim_z)], dim=2)
yz_channel = torch.matmul(yz_range, yy_ones)
yz_channel = yz_channel.permute(0, 1, 3, 4, 2)
yy_channel = torch.cat([yz_channel + i for i in range(dim_x)], dim=4)
zx_channel = torch.matmul(zx_range, zz_ones)
zx_channel = zx_channel.permute(0, 1, 4, 2, 3)
zz_channel = torch.cat([zx_channel + i for i in range(dim_y)], dim=3)
xx_channel = xx_channel.to(x.device)
yy_channel = yy_channel.to(x.device)
zz_channel = zz_channel.to(x.device)
out = torch.cat([x, xx_channel, yy_channel, zz_channel], dim=1)
if self.with_r:
rr = torch.sqrt(
torch.pow(xx_channel - 0.5, 2) + torch.pow(yy_channel - 0.5, 2) + torch.pow(zz_channel - 0.5, 2)
)
out = torch.cat([out, rr], dim=1)
else:
raise NotImplementedError
return out
class CoordConv1d(nn.modules.conv.Conv1d):
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int = 1,
padding: int = 0,
dilation: int = 1,
groups: int = 1,
bias: bool = True,
with_r: bool = False,
):
super().__init__(
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
)
self.rank = 1
self.addcoords = AddCoords(self.rank, with_r)
self.conv = nn.Conv1d(
in_channels + self.rank + int(with_r),
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.addcoords(x)
x = self.conv(x)
return x
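# Usage sketch for CoordConv1d (illustrative): AddCoords appends a normalised position channel
# (plus a radius channel when `with_r=True`) before the convolution, so the layer sees where
# along the sequence each frame sits.
#
#     coord_conv = CoordConv1d(in_channels=80, out_channels=128, kernel_size=3, padding=1, with_r=True)
#     y = coord_conv(torch.randn(2, 80, 200))  # input becomes [2, 82, 200] internally -> [2, 128, 200]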
class CoordConv2d(nn.modules.conv.Conv2d):
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int = 1,
padding: int = 0,
dilation: int = 1,
groups: int = 1,
bias: bool = True,
with_r: bool = False,
):
super().__init__(
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
)
self.rank = 2
self.addcoords = AddCoords(self.rank, with_r)
self.conv = nn.Conv2d(
in_channels + self.rank + int(with_r),
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.addcoords(x)
x = self.conv(x)
return x
class LVCBlock(torch.nn.Module):
"""the location-variable convolutions"""
def __init__( # pylint: disable=dangerous-default-value
self,
in_channels,
cond_channels,
stride,
dilations=[1, 3, 9, 27],
lReLU_slope=0.2,
conv_kernel_size=3,
cond_hop_length=256,
kpnet_hidden_channels=64,
kpnet_conv_size=3,
kpnet_dropout=0.0,
):
super().__init__()
self.cond_hop_length = cond_hop_length
self.conv_layers = len(dilations)
self.conv_kernel_size = conv_kernel_size
self.kernel_predictor = KernelPredictor(
cond_channels=cond_channels,
conv_in_channels=in_channels,
conv_out_channels=2 * in_channels,
conv_layers=len(dilations),
conv_kernel_size=conv_kernel_size,
kpnet_hidden_channels=kpnet_hidden_channels,
kpnet_conv_size=kpnet_conv_size,
kpnet_dropout=kpnet_dropout,
kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope},
)
self.convt_pre = nn.Sequential(
nn.LeakyReLU(lReLU_slope),
nn.utils.parametrizations.weight_norm(
nn.ConvTranspose1d(
in_channels,
in_channels,
2 * stride,
stride=stride,
padding=stride // 2 + stride % 2,
output_padding=stride % 2,
)
),
)
self.conv_blocks = nn.ModuleList()
for dilation in dilations:
self.conv_blocks.append(
nn.Sequential(
nn.LeakyReLU(lReLU_slope),
nn.utils.parametrizations.weight_norm(
nn.Conv1d(
in_channels,
in_channels,
conv_kernel_size,
padding=dilation * (conv_kernel_size - 1) // 2,
dilation=dilation,
)
),
nn.LeakyReLU(lReLU_slope),
)
)
def forward(self, x, c):
"""forward propagation of the location-variable convolutions.
Args:
x (Tensor): the input sequence (batch, in_channels, in_length)
c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
Returns:
Tensor: the output sequence (batch, in_channels, in_length)
"""
_, in_channels, _ = x.shape # (B, c_g, L')
x = self.convt_pre(x) # (B, c_g, stride * L')
kernels, bias = self.kernel_predictor(c)
for i, conv in enumerate(self.conv_blocks):
output = conv(x) # (B, c_g, stride * L')
k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length)
b = bias[:, i, :, :] # (B, 2 * c_g, cond_length)
output = self.location_variable_convolution(
output, k, b, hop_size=self.cond_hop_length
) # (B, 2 * c_g, stride * L'): LVC
x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh(
output[:, in_channels:, :]
) # (B, c_g, stride * L'): GAU
return x
def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256): # pylint: disable=no-self-use
"""perform location-variable convolution operation on the input sequence (x) using the local convolution kernl.
Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100.
Args:
x (Tensor): the input sequence (batch, in_channels, in_length).
kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length)
bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length)
dilation (int): the dilation of convolution.
hop_size (int): the hop_size of the conditioning sequence.
Returns:
(Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length).
"""
batch, _, in_length = x.shape
batch, _, out_channels, kernel_size, kernel_length = kernel.shape
assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched"
padding = dilation * int((kernel_size - 1) / 2)
x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding)
x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding)
if hop_size < dilation:
x = F.pad(x, (0, dilation), "constant", 0)
x = x.unfold(
3, dilation, dilation
) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation)
x = x[:, :, :, :, :hop_size]
x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation)
x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size)
o = torch.einsum("bildsk,biokl->bolsd", x, kernel)
o = o.to(memory_format=torch.channels_last_3d)
bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d)
o = o + bias
o = o.contiguous().view(batch, out_channels, -1)
return o
def remove_weight_norm(self):
self.kernel_predictor.remove_weight_norm()
parametrize.remove_parametrizations(self.convt_pre[1], "weight")
for block in self.conv_blocks:
parametrize.remove_parametrizations(block[1], "weight")
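# Usage sketch for LVCBlock (illustrative, with assumed sizes). The only hard constraint is that
# after the transposed-conv upsampling the audio-rate length equals cond_length * cond_hop_length,
# i.e. stride * L_in == cond_length * cond_hop_length.
#
#     lvc = LVCBlock(in_channels=8, cond_channels=16, stride=4, cond_hop_length=256)
#     x = torch.randn(1, 8, 128)   # hidden sequence, length L_in = 128
#     c = torch.randn(1, 16, 2)    # conditioning (e.g. mel) sequence, cond_length = 2
#     y = lvc(x, c)                # -> [1, 8, 512]  (stride * L_in == cond_length * cond_hop_length)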

View File

@@ -0,0 +1,261 @@
from typing import List, Tuple, Union
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
import torch.nn.functional as F
from TTS.tts.layers.delightful_tts.conformer import ConformerMultiHeadedSelfAttention
from TTS.tts.layers.delightful_tts.conv_layers import CoordConv1d
from TTS.tts.layers.delightful_tts.networks import STL
def get_mask_from_lengths(lengths: torch.Tensor) -> torch.Tensor:
batch_size = lengths.shape[0]
max_len = torch.max(lengths).item()
ids = torch.arange(0, max_len, device=lengths.device).unsqueeze(0).expand(batch_size, -1)
mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)
return mask
def stride_lens(lens: torch.Tensor, stride: int = 2) -> torch.Tensor:
return torch.ceil(lens / stride).int()
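# Quick illustration of the two helpers above:
#
#     get_mask_from_lengths(torch.tensor([3, 1]))
#     # tensor([[False, False, False],
#     #         [False,  True,  True]])   True marks padded positions
#     stride_lens(torch.tensor([5, 8]))   # tensor([3, 4], dtype=torch.int32)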
class ReferenceEncoder(nn.Module):
"""
Reference encoder for the utterance and phoneme prosody encoders. It is
made up of convolutional and RNN layers.
Args:
num_mels (int): Number of mel channels in the input spectrogram.
ref_enc_filters (list[int]): List of channel sizes for encoder layers.
ref_enc_size (int): Size of the kernel for the conv layers.
ref_enc_strides (List[int]): List of strides to use for conv layers.
ref_enc_gru_size (int): Number of hidden features for the gated recurrent unit.
Inputs: inputs, mask
- **inputs** (batch, dim, time): Tensor containing mel vector
- **lengths** (batch): Tensor containing the mel lengths.
Returns:
- **outputs** (batch, time, dim): Tensor produced by Reference Encoder.
"""
def __init__(
self,
num_mels: int,
ref_enc_filters: List[int],
ref_enc_size: int,
ref_enc_strides: List[int],
ref_enc_gru_size: int,
):
super().__init__()
n_mel_channels = num_mels
self.n_mel_channels = n_mel_channels
K = len(ref_enc_filters)
filters = [self.n_mel_channels] + ref_enc_filters
strides = [1] + ref_enc_strides
# Use CoordConv at the first layer to better preserve positional information: https://arxiv.org/pdf/1811.02122.pdf
convs = [
CoordConv1d(
in_channels=filters[0],
out_channels=filters[0 + 1],
kernel_size=ref_enc_size,
stride=strides[0],
padding=ref_enc_size // 2,
with_r=True,
)
]
convs2 = [
nn.Conv1d(
in_channels=filters[i],
out_channels=filters[i + 1],
kernel_size=ref_enc_size,
stride=strides[i],
padding=ref_enc_size // 2,
)
for i in range(1, K)
]
convs.extend(convs2)
self.convs = nn.ModuleList(convs)
self.norms = nn.ModuleList([nn.InstanceNorm1d(num_features=ref_enc_filters[i], affine=True) for i in range(K)])
self.gru = nn.GRU(
input_size=ref_enc_filters[-1],
hidden_size=ref_enc_gru_size,
batch_first=True,
)
def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
inputs --- [N, n_mels, timesteps]
outputs --- [N, E//2]
"""
mel_masks = get_mask_from_lengths(mel_lens).unsqueeze(1)
x = x.masked_fill(mel_masks, 0)
for conv, norm in zip(self.convs, self.norms):
x = conv(x)
x = F.leaky_relu(x, 0.3) # [N, 128, Ty//2^K, n_mels//2^K]
x = norm(x)
for _ in range(2):
mel_lens = stride_lens(mel_lens)
mel_masks = get_mask_from_lengths(mel_lens)
x = x.masked_fill(mel_masks.unsqueeze(1), 0)
x = x.permute((0, 2, 1))
x = torch.nn.utils.rnn.pack_padded_sequence(x, mel_lens.cpu().int(), batch_first=True, enforce_sorted=False)
self.gru.flatten_parameters()
x, memory = self.gru(x) # memory --- [N, Ty, E//2], out --- [1, N, E//2]
x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
return x, memory, mel_masks
def calculate_channels( # pylint: disable=no-self-use
self, L: int, kernel_size: int, stride: int, pad: int, n_convs: int
) -> int:
for _ in range(n_convs):
L = (L - kernel_size + 2 * pad) // stride + 1
return L
class UtteranceLevelProsodyEncoder(nn.Module):
def __init__(
self,
num_mels: int,
ref_enc_filters: List[int],
ref_enc_size: int,
ref_enc_strides: List[int],
ref_enc_gru_size: int,
dropout: float,
n_hidden: int,
bottleneck_size_u: int,
token_num: int,
):
"""
Encoder to extract prosody from an utterance. It is made up of a reference encoder
with a couple of linear layers and a style token layer with dropout.
Args:
num_mels (int): Number of mel channels in the input spectrogram.
ref_enc_filters (list[int]): List of channel sizes for ref encoder layers.
ref_enc_size (int): Size of the kernel for the ref encoder conv layers.
ref_enc_strides (List[int]): List of strides to use for the ref encoder conv layers.
ref_enc_gru_size (int): Number of hidden features for the gated recurrent unit.
dropout (float): Probability of dropout.
n_hidden (int): Size of hidden layers.
bottleneck_size_u (int): Size of the bottleneck layer.
Inputs: inputs, mask
- **inputs** (batch, dim, time): Tensor containing mel vector
- **lengths** (batch): Tensor containing the mel lengths.
Returns:
- **outputs** (batch, 1, dim): Tensor produced by Utterance Level Prosody Encoder.
"""
super().__init__()
self.E = n_hidden
self.d_q = self.d_k = n_hidden
bottleneck_size = bottleneck_size_u
self.encoder = ReferenceEncoder(
ref_enc_filters=ref_enc_filters,
ref_enc_gru_size=ref_enc_gru_size,
ref_enc_size=ref_enc_size,
ref_enc_strides=ref_enc_strides,
num_mels=num_mels,
)
self.encoder_prj = nn.Linear(ref_enc_gru_size, self.E // 2)
self.stl = STL(n_hidden=n_hidden, token_num=token_num)
self.encoder_bottleneck = nn.Linear(self.E, bottleneck_size)
self.dropout = nn.Dropout(dropout)
def forward(self, mels: torch.Tensor, mel_lens: torch.Tensor) -> torch.Tensor:
"""
Shapes:
mels: :math: `[B, C, T]`
mel_lens: :math: `[B]`
out --- [N, seq_len, E]
"""
_, embedded_prosody, _ = self.encoder(mels, mel_lens)
# Bottleneck
embedded_prosody = self.encoder_prj(embedded_prosody)
# Style Token
out = self.encoder_bottleneck(self.stl(embedded_prosody))
out = self.dropout(out)
out = out.view((-1, 1, out.shape[3]))
return out
class PhonemeLevelProsodyEncoder(nn.Module):
def __init__(
self,
num_mels: int,
ref_enc_filters: List[int],
ref_enc_size: int,
ref_enc_strides: List[int],
ref_enc_gru_size: int,
dropout: float,
n_hidden: int,
n_heads: int,
bottleneck_size_p: int,
):
super().__init__()
self.E = n_hidden
self.d_q = self.d_k = n_hidden
bottleneck_size = bottleneck_size_p
self.encoder = ReferenceEncoder(
ref_enc_filters=ref_enc_filters,
ref_enc_gru_size=ref_enc_gru_size,
ref_enc_size=ref_enc_size,
ref_enc_strides=ref_enc_strides,
num_mels=num_mels,
)
self.encoder_prj = nn.Linear(ref_enc_gru_size, n_hidden)
self.attention = ConformerMultiHeadedSelfAttention(
d_model=n_hidden,
num_heads=n_heads,
dropout_p=dropout,
)
self.encoder_bottleneck = nn.Linear(n_hidden, bottleneck_size)
def forward(
self,
x: torch.Tensor,
src_mask: torch.Tensor,
mels: torch.Tensor,
mel_lens: torch.Tensor,
encoding: torch.Tensor,
) -> torch.Tensor:
"""
x --- [N, seq_len, encoder_embedding_dim]
mels --- [N, Ty/r, n_mels*r], r=1
out --- [N, seq_len, bottleneck_size]
attn --- [N, seq_len, ref_len], Ty/r = ref_len
"""
embedded_prosody, _, mel_masks = self.encoder(mels, mel_lens)
# Bottleneck
embedded_prosody = self.encoder_prj(embedded_prosody)
attn_mask = mel_masks.view((mel_masks.shape[0], 1, 1, -1))
x, _ = self.attention(
query=x,
key=embedded_prosody,
value=embedded_prosody,
mask=attn_mask,
encoding=encoding,
)
x = self.encoder_bottleneck(x)
x = x.masked_fill(src_mask.unsqueeze(-1), 0.0)
return x

View File

@@ -0,0 +1,82 @@
from typing import Callable, Tuple
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
from TTS.tts.utils.helpers import average_over_durations
class EnergyAdaptor(nn.Module): # pylint: disable=abstract-method
"""Variance Adaptor with an added 1D conv layer. Used to
get energy embeddings.
Args:
channels_in (int): Number of in channels for conv layers.
channels_out (int): Number of out channels.
kernel_size (int): Size of the kernel for the conv layers.
dropout (float): Probability of dropout.
lrelu_slope (float): Slope for the leaky relu.
emb_kernel_size (int): Size of the kernel for the energy embedding.
Inputs: inputs, mask
- **inputs** (batch, time1, dim): Tensor containing input vector
- **target** (batch, 1, time2): Tensor containing the energy target
- **dr** (batch, time1): Tensor containing aligner durations vector
- **mask** (batch, time1): Tensor containing indices to be masked
Returns:
- **energy prediction** (batch, 1, time1): Tensor produced by energy predictor
- **energy embedding** (batch, channels, time1): Tensor produced by the energy adaptor
- **average energy target (train only)** (batch, 1, time1): Tensor produced after averaging over durations
"""
def __init__(
self,
channels_in: int,
channels_hidden: int,
channels_out: int,
kernel_size: int,
dropout: float,
lrelu_slope: float,
emb_kernel_size: int,
):
super().__init__()
self.energy_predictor = VariancePredictor(
channels_in=channels_in,
channels=channels_hidden,
channels_out=channels_out,
kernel_size=kernel_size,
p_dropout=dropout,
lrelu_slope=lrelu_slope,
)
self.energy_emb = nn.Conv1d(
1,
channels_hidden,
kernel_size=emb_kernel_size,
padding=int((emb_kernel_size - 1) / 2),
)
def get_energy_embedding_train(
self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Shapes:
x: :math: `[B, T_src, C]`
target: :math: `[B, 1, T_max2]`
dr: :math: `[B, T_src]`
mask: :math: `[B, T_src]`
"""
energy_pred = self.energy_predictor(x, mask)
energy_pred.unsqueeze_(1)
avg_energy_target = average_over_durations(target, dr)
energy_emb = self.energy_emb(avg_energy_target)
return energy_pred, avg_energy_target, energy_emb
def get_energy_embedding(self, x: torch.Tensor, mask: torch.Tensor, energy_transform: Callable) -> torch.Tensor:
energy_pred = self.energy_predictor(x, mask)
energy_pred.unsqueeze_(1)
if energy_transform is not None:
energy_pred = energy_transform(energy_pred, (~mask).sum(dim=(1, 2)), self.pitch_mean, self.pitch_std)
energy_emb_pred = self.energy_emb(energy_pred)
return energy_emb_pred, energy_pred

View File

@@ -0,0 +1,128 @@
import torch.nn as nn # pylint: disable=consider-using-from-import
from torch.nn.utils import parametrize
class KernelPredictor(nn.Module):
"""Kernel predictor for the location-variable convolutions
Args:
cond_channels (int): number of channels for the conditioning sequence,
conv_in_channels (int): number of channels for the input sequence,
conv_out_channels (int): number of channels for the output sequence,
conv_layers (int): number of layers
"""
def __init__( # pylint: disable=dangerous-default-value
self,
cond_channels,
conv_in_channels,
conv_out_channels,
conv_layers,
conv_kernel_size=3,
kpnet_hidden_channels=64,
kpnet_conv_size=3,
kpnet_dropout=0.0,
kpnet_nonlinear_activation="LeakyReLU",
kpnet_nonlinear_activation_params={"negative_slope": 0.1},
):
super().__init__()
self.conv_in_channels = conv_in_channels
self.conv_out_channels = conv_out_channels
self.conv_kernel_size = conv_kernel_size
self.conv_layers = conv_layers
kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers # l_w
kpnet_bias_channels = conv_out_channels * conv_layers # l_b
self.input_conv = nn.Sequential(
nn.utils.parametrizations.weight_norm(
nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)
),
getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
)
self.residual_convs = nn.ModuleList()
padding = (kpnet_conv_size - 1) // 2
for _ in range(3):
self.residual_convs.append(
nn.Sequential(
nn.Dropout(kpnet_dropout),
nn.utils.parametrizations.weight_norm(
nn.Conv1d(
kpnet_hidden_channels,
kpnet_hidden_channels,
kpnet_conv_size,
padding=padding,
bias=True,
)
),
getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
nn.utils.parametrizations.weight_norm(
nn.Conv1d(
kpnet_hidden_channels,
kpnet_hidden_channels,
kpnet_conv_size,
padding=padding,
bias=True,
)
),
getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
)
)
self.kernel_conv = nn.utils.parametrizations.weight_norm(
nn.Conv1d(
kpnet_hidden_channels,
kpnet_kernel_channels,
kpnet_conv_size,
padding=padding,
bias=True,
)
)
self.bias_conv = nn.utils.parametrizations.weight_norm(
nn.Conv1d(
kpnet_hidden_channels,
kpnet_bias_channels,
kpnet_conv_size,
padding=padding,
bias=True,
)
)
def forward(self, c):
"""
Args:
c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
"""
batch, _, cond_length = c.shape
c = self.input_conv(c)
for residual_conv in self.residual_convs:
residual_conv.to(c.device)
c = c + residual_conv(c)
k = self.kernel_conv(c)
b = self.bias_conv(c)
kernels = k.contiguous().view(
batch,
self.conv_layers,
self.conv_in_channels,
self.conv_out_channels,
self.conv_kernel_size,
cond_length,
)
bias = b.contiguous().view(
batch,
self.conv_layers,
self.conv_out_channels,
cond_length,
)
return kernels, bias
def remove_weight_norm(self):
parametrize.remove_parametrizations(self.input_conv[0], "weight")
parametrize.remove_parametrizations(self.kernel_conv, "weight")
parametrize.remove_parametrizations(self.bias_conv, "weight")
for block in self.residual_convs:
parametrize.remove_parametrizations(block[1], "weight")
parametrize.remove_parametrizations(block[3], "weight")
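A quick shape check for the kernel predictor above (class assumed in scope; the channel sizes below are made up): the flattened kernel and bias channels are reshaped into per-layer location-variable convolution weights.

import torch

kp = KernelPredictor(cond_channels=80, conv_in_channels=32, conv_out_channels=32, conv_layers=4)
c = torch.randn(2, 80, 100)        # (batch, cond_channels, cond_length)
kernels, bias = kp(c)
print(kernels.shape)               # torch.Size([2, 4, 32, 32, 3, 100])
print(bias.shape)                  # torch.Size([2, 4, 32, 100])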

View File

@ -0,0 +1,219 @@
import math
from typing import Tuple
import numpy as np
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
import torch.nn.functional as F
from TTS.tts.layers.delightful_tts.conv_layers import ConvNorm
def initialize_embeddings(shape: Tuple[int]) -> torch.Tensor:
assert len(shape) == 2, "Can only initialize 2-D embedding matrices ..."
# Kaiming initialization
return torch.randn(shape) * np.sqrt(2 / shape[1])
def positional_encoding(d_model: int, length: int, device: torch.device) -> torch.Tensor:
pe = torch.zeros(length, d_model, device=device)
position = torch.arange(0, length, dtype=torch.float, device=device).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2, device=device).float() * -(math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)
return pe
class BottleneckLayer(nn.Module):
"""
Bottleneck layer for reducing the dimensionality of a tensor.
Args:
in_dim: The number of input dimensions.
reduction_factor: The factor by which to reduce the number of dimensions.
norm: The normalization method to use. Can be "weightnorm" or "instancenorm".
non_linearity: The non-linearity to use. Can be "relu" or "leakyrelu".
kernel_size: The size of the convolutional kernel.
use_partial_padding: Whether to use partial padding with the convolutional kernel.
Shape:
- Input: :math:`[N, in_dim]` where `N` is the batch size and `in_dim` is the number of input dimensions.
- Output: :math:`[N, out_dim]` where `out_dim` is the number of output dimensions.
"""
def __init__(
self,
in_dim,
reduction_factor,
norm="weightnorm",
non_linearity="relu",
kernel_size=3,
use_partial_padding=False, # pylint: disable=unused-argument
):
super(BottleneckLayer, self).__init__() # pylint: disable=super-with-arguments
self.reduction_factor = reduction_factor
reduced_dim = int(in_dim / reduction_factor)
self.out_dim = reduced_dim
if self.reduction_factor > 1:
fn = ConvNorm(in_dim, reduced_dim, kernel_size=kernel_size, use_weight_norm=(norm == "weightnorm"))
if norm == "instancenorm":
fn = nn.Sequential(fn, nn.InstanceNorm1d(reduced_dim, affine=True))
self.projection_fn = fn
self.non_linearity = nn.ReLU()
if non_linearity == "leakyrelu":
self.non_linearity = nn.LeakyReLU()
def forward(self, x):
if self.reduction_factor > 1:
x = self.projection_fn(x)
x = self.non_linearity(x)
return x
class GLUActivation(nn.Module):
"""Class that implements the Gated Linear Unit (GLU) activation function.
The input is split in half along the channel dimension and the first half
is gated by a Leaky ReLU applied to the second half.
"""
def __init__(self, slope: float):
super().__init__()
self.lrelu = nn.LeakyReLU(slope)
def forward(self, x: torch.Tensor) -> torch.Tensor:
out, gate = x.chunk(2, dim=1)
x = out * self.lrelu(gate)
return x
class StyleEmbedAttention(nn.Module):
def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int):
super().__init__()
self.num_units = num_units
self.num_heads = num_heads
self.key_dim = key_dim
self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
def forward(self, query: torch.Tensor, key_soft: torch.Tensor) -> torch.Tensor:
values = self.W_value(key_soft)
split_size = self.num_units // self.num_heads
values = torch.stack(torch.split(values, split_size, dim=2), dim=0)
out_soft = scores_soft = None
querys = self.W_query(query) # [N, T_q, num_units]
keys = self.W_key(key_soft) # [N, T_k, num_units]
# [h, N, T_q, num_units/h]
querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0)
# [h, N, T_k, num_units/h]
keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0)
# [h, N, T_k, num_units/h]
# score = softmax(QK^T / (d_k ** 0.5))
scores_soft = torch.matmul(querys, keys.transpose(2, 3)) # [h, N, T_q, T_k]
scores_soft = scores_soft / (self.key_dim**0.5)
scores_soft = F.softmax(scores_soft, dim=3)
# out = score * V
# [h, N, T_q, num_units/h]
out_soft = torch.matmul(scores_soft, values)
out_soft = torch.cat(torch.split(out_soft, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
return out_soft # , scores_soft
class EmbeddingPadded(nn.Module):
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
super().__init__()
padding_mult = torch.ones((num_embeddings, 1), dtype=torch.int64)
padding_mult[padding_idx] = 0
self.register_buffer("padding_mult", padding_mult)
self.embeddings = nn.parameter.Parameter(initialize_embeddings((num_embeddings, embedding_dim)))
def forward(self, idx: torch.Tensor) -> torch.Tensor:
embeddings_zeroed = self.embeddings * self.padding_mult
x = F.embedding(idx, embeddings_zeroed)
return x
class EmbeddingProjBlock(nn.Module):
def __init__(self, embedding_dim: int):
super().__init__()
self.layers = nn.ModuleList(
[
nn.Linear(embedding_dim, embedding_dim),
nn.LeakyReLU(0.3),
nn.Linear(embedding_dim, embedding_dim),
nn.LeakyReLU(0.3),
]
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
res = x
for layer in self.layers:
x = layer(x)
x = x + res
return x
class LinearNorm(nn.Module):
def __init__(self, in_features: int, out_features: int, bias: bool = False):
super().__init__()
self.linear = nn.Linear(in_features, out_features, bias)
nn.init.xavier_uniform_(self.linear.weight)
if bias:
nn.init.constant_(self.linear.bias, 0.0)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.linear(x)
return x
class STL(nn.Module):
"""
A PyTorch module for the Style Token Layer (STL) as described in
"A Style-Based Generator Architecture for Generative Adversarial Networks"
(https://arxiv.org/abs/1812.04948)
The STL applies a multi-headed attention mechanism over the learned style tokens,
using the text input as the query and the style tokens as the keys and values.
The output of the attention mechanism is used as the text's style embedding.
Args:
token_num (int): The number of style tokens.
n_hidden (int): Number of hidden dimensions.
"""
def __init__(self, n_hidden: int, token_num: int):
super(STL, self).__init__() # pylint: disable=super-with-arguments
num_heads = 1
E = n_hidden
self.token_num = token_num
self.embed = nn.Parameter(torch.FloatTensor(self.token_num, E // num_heads))
d_q = E // 2
d_k = E // num_heads
self.attention = StyleEmbedAttention(query_dim=d_q, key_dim=d_k, num_units=E, num_heads=num_heads)
torch.nn.init.normal_(self.embed, mean=0, std=0.5)
def forward(self, x: torch.Tensor) -> torch.Tensor:
N = x.size(0)
query = x.unsqueeze(1) # [N, 1, E//2]
keys_soft = torch.tanh(self.embed).unsqueeze(0).expand(N, -1, -1) # [N, token_num, E // num_heads]
# Weighted sum
emotion_embed_soft = self.attention(query, keys_soft)
return emotion_embed_soft
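A short usage sketch for the style token layer above (STL and StyleEmbedAttention assumed in scope; the sizes are made up): the query is a pooled reference/prosody vector of size n_hidden // 2.

import torch

n_hidden, token_num = 128, 32
stl = STL(n_hidden=n_hidden, token_num=token_num)
ref = torch.randn(4, n_hidden // 2)   # pooled reference embedding [N, E // 2]
style = stl(ref)                      # weighted sum over the learned style tokens
print(style.shape)                    # torch.Size([4, 1, 128])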

View File

@ -0,0 +1,65 @@
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
from TTS.tts.layers.delightful_tts.conv_layers import ConvTransposed
class PhonemeProsodyPredictor(nn.Module):
"""Non-parallel Prosody Predictor inspired by: https://arxiv.org/pdf/2102.00851.pdf
It consists of 2 layers of 1D convolutions, each followed by a leaky ReLU activation, layer norm
and dropout, then finally a linear layer.
Args:
hidden_size (int): Size of hidden channels.
kernel_size (int): Kernel size for the conv layers.
dropout (float): Probability of dropout.
bottleneck_size (int): bottleneck size for last linear layer.
lrelu_slope (float): Slope of the leaky relu.
"""
def __init__(
self,
hidden_size: int,
kernel_size: int,
dropout: float,
bottleneck_size: int,
lrelu_slope: float,
):
super().__init__()
self.d_model = hidden_size
self.layers = nn.ModuleList(
[
ConvTransposed(
self.d_model,
self.d_model,
kernel_size=kernel_size,
padding=(kernel_size - 1) // 2,
),
nn.LeakyReLU(lrelu_slope),
nn.LayerNorm(self.d_model),
nn.Dropout(dropout),
ConvTransposed(
self.d_model,
self.d_model,
kernel_size=kernel_size,
padding=(kernel_size - 1) // 2,
),
nn.LeakyReLU(lrelu_slope),
nn.LayerNorm(self.d_model),
nn.Dropout(dropout),
]
)
self.predictor_bottleneck = nn.Linear(self.d_model, bottleneck_size)
def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
"""
Shapes:
x: :math: `[B, T, D]`
mask: :math: `[B, T]`
"""
mask = mask.unsqueeze(2)
for layer in self.layers:
x = layer(x)
x = x.masked_fill(mask, 0.0)
x = self.predictor_bottleneck(x)
return x
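A minimal sketch of the predictor above (class assumed in scope, ConvTransposed imported as in the file header; sizes are made up): padded positions are zeroed before the bottleneck projection.

import torch

predictor = PhonemeProsodyPredictor(
    hidden_size=256, kernel_size=5, dropout=0.1, bottleneck_size=16, lrelu_slope=0.3
)
x = torch.randn(2, 10, 256)                  # encoder outputs [B, T, D]
mask = torch.zeros(2, 10, dtype=torch.bool)  # True marks padded positions
print(predictor(x, mask).shape)              # torch.Size([2, 10, 16])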

View File

@ -0,0 +1,88 @@
from typing import Callable, Tuple
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
from TTS.tts.utils.helpers import average_over_durations
class PitchAdaptor(nn.Module): # pylint: disable=abstract-method
"""Module to get pitch embeddings via pitch predictor
Args:
n_input (int): Number of pitch predictor input channels.
n_hidden (int): Number of pitch predictor hidden channels.
n_out (int): Number of pitch predictor out channels.
kernel_size (int): Size of the kernel for conv layers.
emb_kernel_size (int): Size of the kernel for the pitch embedding.
p_dropout (float): Probability of dropout.
lrelu_slope (float): Slope for the leaky relu.
Inputs: inputs, mask
- **inputs** (batch, time1, dim): Tensor containing input vector
- **target** (batch, 1, time2): Tensor containing the pitch target
- **dr** (batch, time1): Tensor containing aligner durations vector
- **mask** (batch, time1): Tensor containing indices to be masked
Returns:
- **pitch prediction** (batch, 1, time1): Tensor produced by pitch predictor
- **pitch embedding** (batch, channels, time1): Tensor produced by the pitch adaptor
- **average pitch target (train only)** (batch, 1, time1): Tensor produced after averaging over durations
"""
def __init__(
self,
n_input: int,
n_hidden: int,
n_out: int,
kernel_size: int,
emb_kernel_size: int,
p_dropout: float,
lrelu_slope: float,
):
super().__init__()
self.pitch_predictor = VariancePredictor(
channels_in=n_input,
channels=n_hidden,
channels_out=n_out,
kernel_size=kernel_size,
p_dropout=p_dropout,
lrelu_slope=lrelu_slope,
)
self.pitch_emb = nn.Conv1d(
1,
n_input,
kernel_size=emb_kernel_size,
padding=int((emb_kernel_size - 1) / 2),
)
def get_pitch_embedding_train(
self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Shapes:
x: :math: `[B, T_src, C]`
target: :math: `[B, 1, T_max2]`
dr: :math: `[B, T_src]`
mask: :math: `[B, T_src]`
"""
pitch_pred = self.pitch_predictor(x, mask) # [B, T_src, C_hidden], [B, T_src] --> [B, T_src]
pitch_pred.unsqueeze_(1) # --> [B, 1, T_src]
avg_pitch_target = average_over_durations(target, dr) # [B, 1, T_mel], [B, T_src] --> [B, 1, T_src]
pitch_emb = self.pitch_emb(avg_pitch_target) # [B, 1, T_src] --> [B, C_hidden, T_src]
return pitch_pred, avg_pitch_target, pitch_emb
def get_pitch_embedding(
self,
x: torch.Tensor,
mask: torch.Tensor,
pitch_transform: Callable,
pitch_mean: torch.Tensor,
pitch_std: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
pitch_pred = self.pitch_predictor(x, mask)
if pitch_transform is not None:
pitch_pred = pitch_transform(pitch_pred, (~mask).sum(), pitch_mean, pitch_std)
pitch_pred.unsqueeze_(1)
pitch_emb_pred = self.pitch_emb(pitch_pred)
return pitch_emb_pred, pitch_pred
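A rough sketch of the inference path above (PitchAdaptor assumed in scope; sizes are made up): with no pitch_transform the predicted pitch is embedded directly.

import torch

B, T_src, C = 2, 4, 256
adaptor = PitchAdaptor(
    n_input=C, n_hidden=C, n_out=1, kernel_size=3,
    emb_kernel_size=3, p_dropout=0.1, lrelu_slope=0.3,
)
x = torch.randn(B, T_src, C)
mask = torch.zeros(B, T_src, dtype=torch.bool)
pitch_emb_pred, pitch_pred = adaptor.get_pitch_embedding(
    x, mask, pitch_transform=None, pitch_mean=None, pitch_std=None
)
print(pitch_emb_pred.shape, pitch_pred.shape)   # [2, 256, 4], [2, 1, 4]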

View File

@ -0,0 +1,68 @@
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
from TTS.tts.layers.delightful_tts.conv_layers import ConvTransposed
class VariancePredictor(nn.Module):
"""
The network consists of two 1D convolution layers, each followed by a leaky relu
activation, layer normalization and dropout, and finally an extra linear layer
that projects the hidden states into the output sequence.
Args:
channels_in (int): Number of in channels for conv layers.
channels (int): Number of hidden channels for the conv layers.
channels_out (int): Number of out channels for the last linear layer.
kernel_size (int): Size of the kernel for the conv layers.
p_dropout (float): Probability of dropout.
lrelu_slope (float): Slope for the leaky relu.
Inputs: inputs, mask
- **inputs** (batch, time, dim): Tensor containing input vector
- **mask** (batch, time): Tensor containing indices to be masked
Returns:
- **outputs** (batch, time): Tensor produced by last linear layer.
"""
def __init__(
self, channels_in: int, channels: int, channels_out: int, kernel_size: int, p_dropout: float, lrelu_slope: float
):
super().__init__()
self.layers = nn.ModuleList(
[
ConvTransposed(
channels_in,
channels,
kernel_size=kernel_size,
padding=(kernel_size - 1) // 2,
),
nn.LeakyReLU(lrelu_slope),
nn.LayerNorm(channels),
nn.Dropout(p_dropout),
ConvTransposed(
channels,
channels,
kernel_size=kernel_size,
padding=(kernel_size - 1) // 2,
),
nn.LeakyReLU(lrelu_slope),
nn.LayerNorm(channels),
nn.Dropout(p_dropout),
]
)
self.linear_layer = nn.Linear(channels, channels_out)
def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
"""
Shapes:
x: :math: `[B, T_src, C]`
mask: :math: `[B, T_src]`
"""
for layer in self.layers:
x = layer(x)
x = self.linear_layer(x)
x = x.squeeze(-1)
x = x.masked_fill(mask, 0.0)
return x
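A small shape check for the predictor above (class assumed in scope; sizes are made up), showing that with channels_out=1 the output is a per-token scalar with padded positions zeroed.

import torch

vp = VariancePredictor(
    channels_in=256, channels=256, channels_out=1,
    kernel_size=5, p_dropout=0.1, lrelu_slope=0.3,
)
x = torch.randn(2, 10, 256)                       # [B, T_src, C]
mask = torch.tensor([[False] * 10, [False] * 6 + [True] * 4])
out = vp(x, mask)
print(out.shape, out[1, 6:])                      # torch.Size([2, 10]); the last 4 values are 0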

View File

@ -81,7 +81,6 @@ class RelativePositionTransformerDecoder(nn.Module):
"""
def __init__(self, in_channels, out_channels, hidden_channels, params):
super().__init__()
self.prenet = Conv1dBN(in_channels, hidden_channels, 1, 1)
self.rel_pos_transformer = RelativePositionTransformer(in_channels, out_channels, hidden_channels, **params)
@ -111,7 +110,6 @@ class FFTransformerDecoder(nn.Module):
"""
def __init__(self, in_channels, out_channels, params):
super().__init__()
self.transformer_block = FFTransformerBlock(in_channels, **params)
self.postnet = nn.Conv1d(in_channels, out_channels, 1)

View File

@ -18,7 +18,6 @@ class DurationPredictor(nn.Module):
"""
def __init__(self, hidden_channels):
super().__init__()
self.layers = nn.ModuleList(

View File

@ -57,6 +57,15 @@ class AlignmentNetwork(torch.nn.Module):
nn.Conv1d(in_query_channels, attn_channels, kernel_size=1, padding=0, bias=True),
)
self.init_layers()
def init_layers(self):
torch.nn.init.xavier_uniform_(self.key_layer[0].weight, gain=torch.nn.init.calculate_gain("relu"))
torch.nn.init.xavier_uniform_(self.key_layer[2].weight, gain=torch.nn.init.calculate_gain("linear"))
torch.nn.init.xavier_uniform_(self.query_layer[0].weight, gain=torch.nn.init.calculate_gain("relu"))
torch.nn.init.xavier_uniform_(self.query_layer[2].weight, gain=torch.nn.init.calculate_gain("linear"))
torch.nn.init.xavier_uniform_(self.query_layer[4].weight, gain=torch.nn.init.calculate_gain("linear"))
def forward(
self, queries: torch.tensor, keys: torch.tensor, mask: torch.tensor = None, attn_prior: torch.tensor = None
) -> Tuple[torch.tensor, torch.tensor]:
@ -75,7 +84,9 @@ class AlignmentNetwork(torch.nn.Module):
attn_logp = -self.temperature * attn_factor.sum(1, keepdim=True)
if attn_prior is not None:
attn_logp = self.log_softmax(attn_logp) + torch.log(attn_prior[:, None] + 1e-8)
if mask is not None:
attn_logp.data.masked_fill_(~mask.bool().unsqueeze(2), -float("inf"))
attn = self.softmax(attn_logp)
return attn, attn_logp

View File

@ -100,7 +100,6 @@ class ResidualConv1dBNBlock(nn.Module):
def __init__(
self, in_channels, out_channels, hidden_channels, kernel_size, dilations, num_res_blocks=13, num_conv_blocks=2
):
super().__init__()
assert len(dilations) == num_res_blocks
self.res_blocks = nn.ModuleList()

View File

@ -1,5 +1,6 @@
import torch
from torch import nn
from torch.nn.utils import parametrize
@torch.jit.script
@ -62,7 +63,7 @@ class WN(torch.nn.Module):
# init conditioning layer
if c_in_channels > 0:
cond_layer = torch.nn.Conv1d(c_in_channels, 2 * hidden_channels * num_layers, 1)
self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
self.cond_layer = torch.nn.utils.parametrizations.weight_norm(cond_layer, name="weight")
# intermediate layers
for i in range(num_layers):
dilation = dilation_rate**i
@ -75,7 +76,7 @@ class WN(torch.nn.Module):
in_layer = torch.nn.Conv1d(
hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding
)
in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
in_layer = torch.nn.utils.parametrizations.weight_norm(in_layer, name="weight")
self.in_layers.append(in_layer)
if i < num_layers - 1:
@ -84,7 +85,7 @@ class WN(torch.nn.Module):
res_skip_channels = hidden_channels
res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
res_skip_layer = torch.nn.utils.parametrizations.weight_norm(res_skip_layer, name="weight")
self.res_skip_layers.append(res_skip_layer)
# setup weight norm
if not weight_norm:
@ -115,11 +116,11 @@ class WN(torch.nn.Module):
def remove_weight_norm(self):
if self.c_in_channels != 0:
torch.nn.utils.remove_weight_norm(self.cond_layer)
parametrize.remove_parametrizations(self.cond_layer, "weight")
for l in self.in_layers:
torch.nn.utils.remove_weight_norm(l)
parametrize.remove_parametrizations(l, "weight")
for l in self.res_skip_layers:
torch.nn.utils.remove_weight_norm(l)
parametrize.remove_parametrizations(l, "weight")
class WNBlocks(nn.Module):
@ -153,7 +154,6 @@ class WNBlocks(nn.Module):
dropout_p=0,
weight_norm=True,
):
super().__init__()
self.wn_blocks = nn.ModuleList()
for idx in range(num_blocks):
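A standalone sketch of the API migration applied in this diff: the parametrization-based weight norm replaces the deprecated torch.nn.utils.weight_norm, and remove_parametrizations replaces remove_weight_norm.

import torch
from torch import nn
from torch.nn.utils import parametrize

conv = nn.Conv1d(16, 32, 3, padding=1)
conv = nn.utils.parametrizations.weight_norm(conv, name="weight")
print(parametrize.is_parametrized(conv))     # True: weight is now computed from the parametrization
parametrize.remove_parametrizations(conv, "weight")
print(parametrize.is_parametrized(conv))     # False: same effect as the old remove_weight_norm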

View File

@ -1,6 +1,5 @@
from distutils.version import LooseVersion
import torch
from packaging.version import Version
from torch import nn
from torch.nn import functional as F
@ -91,7 +90,7 @@ class InvConvNear(nn.Module):
self.no_jacobian = no_jacobian
self.weight_inv = None
if LooseVersion(torch.__version__) < LooseVersion("1.9"):
if Version(torch.__version__) < Version("1.9"):
w_init = torch.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_())[0]
else:
w_init = torch.linalg.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_(), "complete")[0]
@ -187,7 +186,7 @@ class CouplingBlock(nn.Module):
self.sigmoid_scale = sigmoid_scale
# input layer
start = torch.nn.Conv1d(in_channels // 2, hidden_channels, 1)
start = torch.nn.utils.weight_norm(start)
start = torch.nn.utils.parametrizations.weight_norm(start)
self.start = start
# output layer
# Initializing last layer to 0 makes the affine coupling layers

View File

@ -64,7 +64,6 @@ class RelativePositionMultiHeadAttention(nn.Module):
proximal_bias=False,
proximal_init=False,
):
super().__init__()
assert channels % num_heads == 0, " [!] channels should be divisible by num_heads."
# class attributes
@ -272,7 +271,6 @@ class FeedForwardNetwork(nn.Module):
"""
def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dropout_p=0.0, causal=False):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels

View File

@ -165,7 +165,7 @@ class BCELossMasked(nn.Module):
def __init__(self, pos_weight: float = None):
super().__init__()
self.pos_weight = nn.Parameter(torch.tensor([pos_weight]), requires_grad=False)
self.register_buffer("pos_weight", torch.tensor([pos_weight]))
def forward(self, x, target, length):
"""
@ -191,16 +191,21 @@ class BCELossMasked(nn.Module):
mask = sequence_mask(sequence_length=length, max_len=target.size(1))
num_items = mask.sum()
loss = functional.binary_cross_entropy_with_logits(
x.masked_select(mask), target.masked_select(mask), pos_weight=self.pos_weight, reduction="sum"
x.masked_select(mask),
target.masked_select(mask),
pos_weight=self.pos_weight.to(x.device),
reduction="sum",
)
else:
loss = functional.binary_cross_entropy_with_logits(x, target, pos_weight=self.pos_weight, reduction="sum")
loss = functional.binary_cross_entropy_with_logits(
x, target, pos_weight=self.pos_weight.to(x.device), reduction="sum"
)
num_items = torch.numel(x)
loss = loss / num_items
return loss
class DifferentailSpectralLoss(nn.Module):
class DifferentialSpectralLoss(nn.Module):
"""Differential Spectral Loss
https://arxiv.org/ftp/arxiv/papers/1909/1909.10302.pdf"""
@ -335,7 +340,7 @@ class TacotronLoss(torch.nn.Module):
self.criterion_ga = GuidedAttentionLoss(sigma=ga_sigma)
# differential spectral loss
if c.postnet_diff_spec_alpha > 0 or c.decoder_diff_spec_alpha > 0:
self.criterion_diff_spec = DifferentailSpectralLoss(loss_func=self.criterion)
self.criterion_diff_spec = DifferentialSpectralLoss(loss_func=self.criterion)
# ssim loss
if c.postnet_ssim_alpha > 0 or c.decoder_ssim_alpha > 0:
self.criterion_ssim = SSIMLoss()
@ -363,7 +368,6 @@ class TacotronLoss(torch.nn.Module):
alignments_backwards,
input_lens,
):
# decoder outputs linear or mel spectrograms for Tacotron and Tacotron2
# the target should be set accordingly
postnet_target = linear_input if self.config.model.lower() in ["tacotron"] else mel_input
@ -801,6 +805,10 @@ class ForwardTTSLoss(nn.Module):
self.pitch_loss = MSELossMasked(False)
self.pitch_loss_alpha = c.pitch_loss_alpha
if c.model_args.use_energy:
self.energy_loss = MSELossMasked(False)
self.energy_loss_alpha = c.energy_loss_alpha
if c.use_ssim_loss:
self.ssim = SSIMLoss() if c.use_ssim_loss else None
self.ssim_loss_alpha = c.ssim_loss_alpha
@ -826,6 +834,8 @@ class ForwardTTSLoss(nn.Module):
dur_target,
pitch_output,
pitch_target,
energy_output,
energy_target,
input_lens,
alignment_logprob=None,
alignment_hard=None,
@ -855,6 +865,11 @@ class ForwardTTSLoss(nn.Module):
loss = loss + self.pitch_loss_alpha * pitch_loss
return_dict["loss_pitch"] = self.pitch_loss_alpha * pitch_loss
if hasattr(self, "energy_loss") and self.energy_loss_alpha > 0:
energy_loss = self.energy_loss(energy_output.transpose(1, 2), energy_target.transpose(1, 2), input_lens)
loss = loss + self.energy_loss_alpha * energy_loss
return_dict["loss_energy"] = self.energy_loss_alpha * energy_loss
if hasattr(self, "aligner_loss") and self.aligner_loss_alpha > 0:
aligner_loss = self.aligner_loss(alignment_logprob, input_lens, decoder_output_lens)
loss = loss + self.aligner_loss_alpha * aligner_loss
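A rough sketch of how the new energy term is computed, assuming MSELossMasked (defined elsewhere in this losses module) is in scope; the shapes follow the call above and the alpha value is made up.

import torch

B, T_src = 2, 6
energy_output = torch.randn(B, 1, T_src)     # predicted per-token energy
energy_target = torch.randn(B, 1, T_src)     # duration-averaged target energy
input_lens = torch.tensor([6, 4])
energy_loss_alpha = 0.1                      # made-up weight; comes from the config in practice
energy_loss = MSELossMasked(False)(
    energy_output.transpose(1, 2), energy_target.transpose(1, 2), input_lens
)
print(energy_loss_alpha * energy_loss)       # contribution added to the total loss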

View File

View File

@ -0,0 +1,323 @@
from typing import List, Tuple
import torch
import torch.nn.functional as F
from torch import nn
from tqdm.auto import tqdm
from TTS.tts.layers.tacotron.common_layers import Linear
from TTS.tts.layers.tacotron.tacotron2 import ConvBNBlock
class Encoder(nn.Module):
r"""Neural HMM Encoder
Same as the Tacotron 2 encoder but expands the output length by the number of states per phone
Args:
num_chars (int): Number of characters in the input.
state_per_phone (int): Number of states per phone.
in_out_channels (int): number of input and output channels.
n_convolutions (int): number of convolutional layers.
"""
def __init__(self, num_chars, state_per_phone, in_out_channels=512, n_convolutions=3):
super().__init__()
self.state_per_phone = state_per_phone
self.in_out_channels = in_out_channels
self.emb = nn.Embedding(num_chars, in_out_channels)
self.convolutions = nn.ModuleList()
for _ in range(n_convolutions):
self.convolutions.append(ConvBNBlock(in_out_channels, in_out_channels, 5, "relu"))
self.lstm = nn.LSTM(
in_out_channels,
int(in_out_channels / 2) * state_per_phone,
num_layers=1,
batch_first=True,
bias=True,
bidirectional=True,
)
self.rnn_state = None
def forward(self, x: torch.FloatTensor, x_len: torch.LongTensor) -> Tuple[torch.FloatTensor, torch.LongTensor]:
"""Forward pass to the encoder.
Args:
x (torch.FloatTensor): input text indices.
- shape: :math:`(b, T_{in})`
x_len (torch.LongTensor): input text lengths.
- shape: :math:`(b,)`
Returns:
Tuple[torch.FloatTensor, torch.LongTensor]: encoder outputs and output lengths.
-shape: :math:`((b, T_{in} * states_per_phone, in_out_channels), (b,))`
"""
b, T = x.shape
o = self.emb(x).transpose(1, 2)
for layer in self.convolutions:
o = layer(o)
o = o.transpose(1, 2)
o = nn.utils.rnn.pack_padded_sequence(o, x_len.cpu(), batch_first=True)
self.lstm.flatten_parameters()
o, _ = self.lstm(o)
o, _ = nn.utils.rnn.pad_packed_sequence(o, batch_first=True)
o = o.reshape(b, T * self.state_per_phone, self.in_out_channels)
x_len = x_len * self.state_per_phone
return o, x_len
def inference(self, x, x_len):
"""Inference to the encoder.
Args:
x (torch.FloatTensor): input text indices.
- shape: :math:`(b, T_{in})`
x_len (torch.LongTensor): input text lengths.
- shape: :math:`(b,)`
Returns:
Tuple[torch.FloatTensor, torch.LongTensor]: encoder outputs and output lengths.
-shape: :math:`((b, T_{in} * states_per_phone, in_out_channels), (b,))`
"""
b, T = x.shape
o = self.emb(x).transpose(1, 2)
for layer in self.convolutions:
o = layer(o)
o = o.transpose(1, 2)
# self.lstm.flatten_parameters()
o, _ = self.lstm(o)
o = o.reshape(b, T * self.state_per_phone, self.in_out_channels)
x_len = x_len * self.state_per_phone
return o, x_len
class ParameterModel(nn.Module):
r"""Main neural network of the outputnet
Note: Do not put dropout layers here, the model will not converge.
Args:
outputnet_size (List[int]): the architecture of the parameter model
input_size (int): size of input for the first layer
output_size (int): size of the output, i.e. the size of the feature dim
frame_channels (int): feature dim to set the flat start bias
flat_start_params (dict): flat start parameters to set the bias
"""
def __init__(
self,
outputnet_size: List[int],
input_size: int,
output_size: int,
frame_channels: int,
flat_start_params: dict,
):
super().__init__()
self.frame_channels = frame_channels
self.layers = nn.ModuleList(
[Linear(inp, out) for inp, out in zip([input_size] + outputnet_size[:-1], outputnet_size)]
)
self.last_layer = nn.Linear(outputnet_size[-1], output_size)
self.flat_start_output_layer(
flat_start_params["mean"], flat_start_params["std"], flat_start_params["transition_p"]
)
def flat_start_output_layer(self, mean, std, transition_p):
self.last_layer.weight.data.zero_()
self.last_layer.bias.data[0 : self.frame_channels] = mean
self.last_layer.bias.data[self.frame_channels : 2 * self.frame_channels] = OverflowUtils.inverse_softplus(std)
self.last_layer.bias.data[2 * self.frame_channels :] = OverflowUtils.inverse_sigmod(transition_p)
def forward(self, x):
for layer in self.layers:
x = F.relu(layer(x))
x = self.last_layer(x)
return x
class Outputnet(nn.Module):
r"""
This network takes current state and previous observed values as input
and returns its parameters, mean, standard deviation and probability
of transition to the next state
"""
def __init__(
self,
encoder_dim: int,
memory_rnn_dim: int,
frame_channels: int,
outputnet_size: List[int],
flat_start_params: dict,
std_floor: float = 1e-2,
):
super().__init__()
self.frame_channels = frame_channels
self.flat_start_params = flat_start_params
self.std_floor = std_floor
input_size = memory_rnn_dim + encoder_dim
output_size = 2 * frame_channels + 1
self.parametermodel = ParameterModel(
outputnet_size=outputnet_size,
input_size=input_size,
output_size=output_size,
flat_start_params=flat_start_params,
frame_channels=frame_channels,
)
def forward(self, ar_mels, inputs):
r"""Inputs observation and returns the means, stds and transition probability for the current state
Args:
ar_mel_inputs (torch.FloatTensor): shape (batch, prenet_dim)
states (torch.FloatTensor): (batch, hidden_states, hidden_state_dim)
Returns:
means: means for the emission observation for each feature
- shape: (B, hidden_states, feature_size)
stds: standard deviations for the emission observation for each feature
- shape: (batch, hidden_states, feature_size)
transition_vectors: transition vector for the current hidden state
- shape: (batch, hidden_states)
"""
batch_size, prenet_dim = ar_mels.shape[0], ar_mels.shape[1]
N = inputs.shape[1]
ar_mels = ar_mels.unsqueeze(1).expand(batch_size, N, prenet_dim)
ar_mels = torch.cat((ar_mels, inputs), dim=2)
ar_mels = self.parametermodel(ar_mels)
mean, std, transition_vector = (
ar_mels[:, :, 0 : self.frame_channels],
ar_mels[:, :, self.frame_channels : 2 * self.frame_channels],
ar_mels[:, :, 2 * self.frame_channels :].squeeze(2),
)
std = F.softplus(std)
std = self._floor_std(std)
return mean, std, transition_vector
def _floor_std(self, std):
r"""
Clamps the standard deviation so that it does not go below a floor value.
This prevents the model from cheating for higher likelihoods by collapsing
one of the Gaussians into a point mass.
Args:
std (float Tensor): tensor containing the standard deviation to be floored
"""
original_tensor = std.clone().detach()
std = torch.clamp(std, min=self.std_floor)
if torch.any(original_tensor != std):
print(
"[*] Standard deviation was floored! The model is preventing overfitting, nothing serious to worry about"
)
return std
class OverflowUtils:
@staticmethod
def get_data_parameters_for_flat_start(
data_loader: torch.utils.data.DataLoader, out_channels: int, states_per_phone: int
):
"""Generates data parameters for flat starting the HMM.
Args:
data_loader (torch.utils.data.DataLoader): data loader to iterate over.
out_channels (int): mel spectrogram channels
states_per_phone (int): HMM states per phone
"""
# State related information for transition_p
total_state_len = 0
total_mel_len = 0
# Useful for data mean and std
total_mel_sum = 0
total_mel_sq_sum = 0
for batch in tqdm(data_loader, leave=False):
text_lengths = batch["token_id_lengths"]
mels = batch["mel"]
mel_lengths = batch["mel_lengths"]
total_state_len += torch.sum(text_lengths)
total_mel_len += torch.sum(mel_lengths)
total_mel_sum += torch.sum(mels)
total_mel_sq_sum += torch.sum(torch.pow(mels, 2))
data_mean = total_mel_sum / (total_mel_len * out_channels)
data_std = torch.sqrt((total_mel_sq_sum / (total_mel_len * out_channels)) - torch.pow(data_mean, 2))
average_num_states = total_state_len / len(data_loader.dataset)
average_mel_len = total_mel_len / len(data_loader.dataset)
average_duration_each_state = average_mel_len / average_num_states
init_transition_prob = 1 / average_duration_each_state
return data_mean, data_std, (init_transition_prob * states_per_phone)
@staticmethod
@torch.no_grad()
def update_flat_start_transition(model, transition_p):
model.neural_hmm.output_net.parametermodel.flat_start_output_layer(0.0, 1.0, transition_p)
@staticmethod
def log_clamped(x, eps=1e-04):
"""
Avoids the log(0) problem
Args:
x (torch.tensor): input tensor
eps (float, optional): lower bound. Defaults to 1e-04.
Returns:
torch.tensor: :math:`log(x)`
"""
clamped_x = torch.clamp(x, min=eps)
return torch.log(clamped_x)
@staticmethod
def inverse_sigmod(x):
r"""
Inverse of the sigmoid function
"""
if not torch.is_tensor(x):
x = torch.tensor(x)
return OverflowUtils.log_clamped(x / (1.0 - x))
@staticmethod
def inverse_softplus(x):
r"""
Inverse of the softplus function
"""
if not torch.is_tensor(x):
x = torch.tensor(x)
return OverflowUtils.log_clamped(torch.exp(x) - 1.0)
@staticmethod
def logsumexp(x, dim):
r"""
Differentiable LogSumExp: does not create NaN gradients; when all the
inputs are -inf it yields 0 gradients.
Args:
x : torch.Tensor - The input tensor
dim: int - The dimension on which the log sum exp has to be applied
"""
m, _ = x.max(dim=dim)
mask = m == -float("inf")
s = (x - m.masked_fill_(mask, 0).unsqueeze(dim=dim)).exp().sum(dim=dim)
return s.masked_fill_(mask, 1).log() + m.masked_fill_(mask, -float("inf"))
@staticmethod
def double_pad(list_of_different_shape_tensors):
r"""
Pads the list of tensors in 2 dimensions
"""
second_dim_lens = [len(a) for a in [i[0] for i in list_of_different_shape_tensors]]
second_dim_max = max(second_dim_lens)
padded_x = [F.pad(x, (0, second_dim_max - len(x[0]))) for x in list_of_different_shape_tensors]
return nn.utils.rnn.pad_sequence(padded_x, batch_first=True)
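A small numerical check of the flat-start helpers above (OverflowUtils assumed in scope; the values are arbitrary): the output-net bias is set with these inverses so that softplus and sigmoid recover the requested std and transition probability at the first step.

import torch
import torch.nn.functional as F

std = torch.tensor(1.0)
transition_p = torch.tensor(0.14)
print(F.softplus(OverflowUtils.inverse_softplus(std)))            # ~1.0
print(torch.sigmoid(OverflowUtils.inverse_sigmod(transition_p)))  # ~0.14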

View File

@ -0,0 +1,81 @@
import torch
from torch import nn
from TTS.tts.layers.glow_tts.decoder import Decoder as GlowDecoder
from TTS.tts.utils.helpers import sequence_mask
class Decoder(nn.Module):
"""Uses glow decoder with some modifications.
::
Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze
Args:
in_channels (int): channels of input tensor.
hidden_channels (int): hidden decoder channels.
kernel_size (int): Coupling block kernel size. (Wavenet filter kernel size.)
dilation_rate (int): rate to increase dilation by each layer in a decoder block.
num_flow_blocks (int): number of decoder blocks.
num_coupling_layers (int): number of coupling layers. (number of wavenet layers.)
dropout_p (float): wavenet dropout rate.
sigmoid_scale (bool): enable/disable sigmoid scaling in coupling layer.
"""
def __init__(
self,
in_channels,
hidden_channels,
kernel_size,
dilation_rate,
num_flow_blocks,
num_coupling_layers,
dropout_p=0.0,
num_splits=4,
num_squeeze=2,
sigmoid_scale=False,
c_in_channels=0,
):
super().__init__()
self.glow_decoder = GlowDecoder(
in_channels,
hidden_channels,
kernel_size,
dilation_rate,
num_flow_blocks,
num_coupling_layers,
dropout_p,
num_splits,
num_squeeze,
sigmoid_scale,
c_in_channels,
)
self.n_sqz = num_squeeze
def forward(self, x, x_len, g=None, reverse=False):
"""
Input shapes:
- x: :math:`[B, C, T]`
- x_len :math:`[B]`
- g: :math:`[B, C]`
Output shapes:
- x: :math:`[B, C, T]`
- x_len :math:`[B]`
- logdet_tot :math:`[B]`
"""
x, x_len, x_max_len = self.preprocess(x, x_len, x_len.max())
x_mask = torch.unsqueeze(sequence_mask(x_len, x_max_len), 1).to(x.dtype)
x, logdet_tot = self.glow_decoder(x, x_mask, g, reverse)
return x, x_len, logdet_tot
def preprocess(self, y, y_lengths, y_max_length):
if y_max_length is not None:
y_max_length = torch.div(y_max_length, self.n_sqz, rounding_mode="floor") * self.n_sqz
y = y[:, :, :y_max_length]
y_lengths = torch.div(y_lengths, self.n_sqz, rounding_mode="floor") * self.n_sqz
return y, y_lengths, y_max_length
def store_inverse(self):
self.glow_decoder.store_inverse()
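A standalone illustration of the preprocess step above (no model needed): frames are trimmed so the mel length is divisible by the squeeze factor, here assumed to be 2.

import torch

n_sqz = 2
y = torch.randn(1, 80, 101)                 # [B, C, T] with an odd number of frames
y_lengths = torch.tensor([101])
y_max_length = torch.div(y_lengths.max(), n_sqz, rounding_mode="floor") * n_sqz
y = y[:, :, :y_max_length]
y_lengths = torch.div(y_lengths, n_sqz, rounding_mode="floor") * n_sqz
print(y.shape, y_lengths)                   # torch.Size([1, 80, 100]) tensor([100])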

View File

@ -0,0 +1,553 @@
from typing import List
import torch
import torch.distributions as tdist
import torch.nn.functional as F
from torch import nn
from torch.utils.checkpoint import checkpoint
from TTS.tts.layers.overflow.common_layers import Outputnet, OverflowUtils
from TTS.tts.layers.tacotron.common_layers import Prenet
from TTS.tts.utils.helpers import sequence_mask
class NeuralHMM(nn.Module):
"""Autoregressive left to right HMM model primarily used in "Neural HMMs are all you need (for high-quality attention-free TTS)"
Paper::
https://arxiv.org/abs/2108.13320
Paper abstract::
Neural sequence-to-sequence TTS has achieved significantly better output quality than statistical speech synthesis using
HMMs. However, neural TTS is generally not probabilistic and uses non-monotonic attention. Attention failures increase
training time and can make synthesis babble incoherently. This paper describes how the old and new paradigms can be
combined to obtain the advantages of both worlds, by replacing attention in neural TTS with an autoregressive left-right
no-skip hidden Markov model defined by a neural network. Based on this proposal, we modify Tacotron 2 to obtain an
HMM-based neural TTS model with monotonic alignment, trained to maximise the full sequence likelihood without
approximation. We also describe how to combine ideas from classical and contemporary TTS for best results. The resulting
example system is smaller and simpler than Tacotron 2, and learns to speak with fewer iterations and less data, whilst
achieving comparable naturalness prior to the post-net. Our approach also allows easy control over speaking rate.
Args:
frame_channels (int): Output dimension to generate.
ar_order (int): Autoregressive order of the model. In the Neural HMM ablations it was found that more autoregression gives more variation but hurts the naturalness of the synthesised audio.
deterministic_transition (bool): deterministic duration generation based on duration quantiles as defined in "S. Ronanki, O. Watts, S. King, and G. E. Henter, “Median-based generation of synthetic speech durations using a non-parametric approach,” in Proc. SLT, 2016.". Defaults to True.
encoder_dim (int): Channels of encoder input and character embedding tensors. Defaults to 512.
prenet_type (str): `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the Prenet.
prenet_dim (int): Dimension of the Prenet.
prenet_n_layers (int): Number of layers in the Prenet.
prenet_dropout (float): Dropout probability of the Prenet.
prenet_dropout_at_inference (bool): If True, dropout is applied at inference time.
memory_rnn_dim (int): Size of the memory RNN to process output of prenet.
outputnet_size (List[int]): Size of the output network inside the neural HMM.
flat_start_params (dict): Parameters for the flat start initialization of the neural HMM.
std_floor (float): Floor value for the standard deviation of the neural HMM. Prevents model cheating by putting point mass and getting infinite likelihood at any datapoint.
use_grad_checkpointing (bool, optional): Use gradient checkpointing to save memory. Defaults to True.
"""
def __init__(
self,
frame_channels: int,
ar_order: int,
deterministic_transition: bool,
encoder_dim: int,
prenet_type: str,
prenet_dim: int,
prenet_n_layers: int,
prenet_dropout: float,
prenet_dropout_at_inference: bool,
memory_rnn_dim: int,
outputnet_size: List[int],
flat_start_params: dict,
std_floor: float,
use_grad_checkpointing: bool = True,
):
super().__init__()
self.frame_channels = frame_channels
self.ar_order = ar_order
self.deterministic_transition = deterministic_transition
self.prenet_dim = prenet_dim
self.memory_rnn_dim = memory_rnn_dim
self.use_grad_checkpointing = use_grad_checkpointing
self.transition_model = TransitionModel()
self.emission_model = EmissionModel()
assert ar_order > 0, f"AR order must be greater than 0 provided {ar_order}"
self.ar_order = ar_order
self.prenet = Prenet(
in_features=frame_channels * ar_order,
prenet_type=prenet_type,
prenet_dropout=prenet_dropout,
dropout_at_inference=prenet_dropout_at_inference,
out_features=[self.prenet_dim for _ in range(prenet_n_layers)],
bias=False,
)
self.memory_rnn = nn.LSTMCell(input_size=prenet_dim, hidden_size=memory_rnn_dim)
self.output_net = Outputnet(
encoder_dim, memory_rnn_dim, frame_channels, outputnet_size, flat_start_params, std_floor
)
self.register_buffer("go_tokens", torch.zeros(ar_order, 1))
def forward(self, inputs, inputs_len, mels, mel_lens):
r"""HMM forward algorithm for training uses logarithmic version of Rabiner (1989) forward algorithm.
Args:
inputs (torch.FloatTensor): Encoder outputs
inputs_len (torch.LongTensor): Encoder output lengths
mels (torch.FloatTensor): Mel inputs
mel_lens (torch.LongTensor): Length of mel inputs
Shapes:
- inputs: (B, T, D_out_enc)
- inputs_len: (B)
- mels: (B, D_mel, T_mel)
- mel_lens: (B)
Returns:
log_prob (torch.FloatTensor): Log probability of the sequence
"""
# Get dimensions of inputs
batch_size, N, _ = inputs.shape
T_max = torch.max(mel_lens)
mels = mels.permute(0, 2, 1)
# Initialize forward algorithm
log_state_priors = self._initialize_log_state_priors(inputs)
log_c, log_alpha_scaled, transition_matrix, means = self._initialize_forward_algorithm_variables(mels, N)
# Initialize autoregression elements
ar_inputs = self._add_go_token(mels)
h_memory, c_memory = self._init_lstm_states(batch_size, self.memory_rnn_dim, mels)
for t in range(T_max):
# Process Autoregression
h_memory, c_memory = self._process_ar_timestep(t, ar_inputs, h_memory, c_memory)
# Get mean, std and transition vector from decoder for this timestep
# Note: Gradient checkpointing currently doesn't work with multiple GPUs inside a loop
if self.use_grad_checkpointing and self.training:
mean, std, transition_vector = checkpoint(self.output_net, h_memory, inputs)
else:
mean, std, transition_vector = self.output_net(h_memory, inputs)
if t == 0:
log_alpha_temp = log_state_priors + self.emission_model(mels[:, 0], mean, std, inputs_len)
else:
log_alpha_temp = self.emission_model(mels[:, t], mean, std, inputs_len) + self.transition_model(
log_alpha_scaled[:, t - 1, :], transition_vector, inputs_len
)
log_c[:, t] = torch.logsumexp(log_alpha_temp, dim=1)
log_alpha_scaled[:, t, :] = log_alpha_temp - log_c[:, t].unsqueeze(1)
transition_matrix[:, t] = transition_vector # needed for absorption state calculation
# Save for plotting
means.append(mean.detach())
log_c, log_alpha_scaled = self._mask_lengths(mel_lens, log_c, log_alpha_scaled)
sum_final_log_c = self.get_absorption_state_scaling_factor(
mel_lens, log_alpha_scaled, inputs_len, transition_matrix
)
log_probs = torch.sum(log_c, dim=1) + sum_final_log_c
return log_probs, log_alpha_scaled, transition_matrix, means
@staticmethod
def _mask_lengths(mel_lens, log_c, log_alpha_scaled):
"""
Mask the forward variables beyond the mel lengths so that the padded
timesteps do not contribute to the loss calculation
Args:
mel_lens (torch.IntTensor): (batch)
log_c (torch.FloatTensor): (batch, T)
log_alpha_scaled (torch.FloatTensor): (batch, T, N)
Returns:
log_c (torch.FloatTensor) : scaled probabilities (batch, T)
log_alpha_scaled (torch.FloatTensor): forward probabilities (batch, T, N)
"""
mask_log_c = sequence_mask(mel_lens)
log_c = log_c * mask_log_c
mask_log_alpha_scaled = mask_log_c.unsqueeze(2)
log_alpha_scaled = log_alpha_scaled * mask_log_alpha_scaled
return log_c, log_alpha_scaled
def _process_ar_timestep(
self,
t,
ar_inputs,
h_memory,
c_memory,
):
"""
Process autoregression in timestep
1. At a specific t timestep
2. Perform data dropout if applied (we did not use it)
3. Run the autoregressive frame through the prenet (has dropout)
4. Run the prenet output through the post prenet rnn
Args:
t (int): mel-spec timestep
ar_inputs (torch.FloatTensor): go-token appended mel-spectrograms
- shape: (b, D_out, T_out)
h_memory (torch.FloatTensor): previous timestep rnn hidden state
- shape: (b, memory_rnn_dim)
c_memory (torch.FloatTensor): previous timestep rnn cell state
- shape: (b, memory_rnn_dim)
Returns:
h_memory (torch.FloatTensor): rnn hidden state of the current timestep
c_memory (torch.FloatTensor): rnn cell state of the current timestep
"""
prenet_input = ar_inputs[:, t : t + self.ar_order].flatten(1)
memory_inputs = self.prenet(prenet_input)
h_memory, c_memory = self.memory_rnn(memory_inputs, (h_memory, c_memory))
return h_memory, c_memory
def _add_go_token(self, mel_inputs):
"""Append the go token to create the autoregressive input
Args:
mel_inputs (torch.FloatTensor): (batch_size, T, n_mel_channel)
Returns:
ar_inputs (torch.FloatTensor): (batch_size, T, n_mel_channel)
"""
batch_size, T, _ = mel_inputs.shape
go_tokens = self.go_tokens.unsqueeze(0).expand(batch_size, self.ar_order, self.frame_channels)
ar_inputs = torch.cat((go_tokens, mel_inputs), dim=1)[:, :T]
return ar_inputs
@staticmethod
def _initialize_forward_algorithm_variables(mel_inputs, N):
r"""Initialize placeholders for forward algorithm variables, to use a stable
version we will use log_alpha_scaled and the scaling constant
Args:
mel_inputs (torch.FloatTensor): (b, T_max, frame_channels)
N (int): number of states
Returns:
log_c (torch.FloatTensor): Scaling constant (b, T_max)
"""
b, T_max, _ = mel_inputs.shape
log_alpha_scaled = mel_inputs.new_zeros((b, T_max, N))
log_c = mel_inputs.new_zeros(b, T_max)
transition_matrix = mel_inputs.new_zeros((b, T_max, N))
# Saving for plotting later, will not have gradient tapes
means = []
return log_c, log_alpha_scaled, transition_matrix, means
@staticmethod
def _init_lstm_states(batch_size, hidden_state_dim, device_tensor):
r"""
Initialize Hidden and Cell states for LSTM Cell
Args:
batch_size (Int): batch size
hidden_state_dim (Int): dimensions of the h and c
device_tensor (torch.FloatTensor): useful for the device and type
Returns:
(torch.FloatTensor): shape (batch_size, hidden_state_dim)
can be hidden state for LSTM
(torch.FloatTensor): shape (batch_size, hidden_state_dim)
can be the cell state for LSTM
"""
return (
device_tensor.new_zeros(batch_size, hidden_state_dim),
device_tensor.new_zeros(batch_size, hidden_state_dim),
)
def get_absorption_state_scaling_factor(self, mels_len, log_alpha_scaled, inputs_len, transition_vector):
"""Returns the final scaling factor of absorption state
Args:
mels_len (torch.IntTensor): Input size of mels to
get the last timestep of log_alpha_scaled
log_alpha_scaled (torch.FloatTensor): State probabilities
inputs_len (torch.IntTensor): lengths of the states, used to
mask the values beyond the state lengths
(
Useful when the batch has very different lengths,
when the length of an observation is less than
the number of max states, then the log alpha after
the state value is filled with -infs. So we mask
those values so that it only consider the states
which are needed for that length
)
transition_vector (torch.FloatTensor): transition vector for each state per timestep
Shapes:
- mels_len: (batch_size)
- log_alpha_scaled: (batch_size, T, N)
- inputs_len: (batch_size)
- transition_vector: (batch_size, T, N)
Returns:
sum_final_log_c (torch.FloatTensor): (batch_size)
"""
N = torch.max(inputs_len)
max_inputs_len = log_alpha_scaled.shape[2]
state_lengths_mask = sequence_mask(inputs_len, max_len=max_inputs_len)
last_log_alpha_scaled_index = (
(mels_len - 1).unsqueeze(-1).expand(-1, N).unsqueeze(1)
) # Batch X Hidden State Size
last_log_alpha_scaled = torch.gather(log_alpha_scaled, 1, last_log_alpha_scaled_index).squeeze(1)
last_log_alpha_scaled = last_log_alpha_scaled.masked_fill(~state_lengths_mask, -float("inf"))
last_transition_vector = torch.gather(transition_vector, 1, last_log_alpha_scaled_index).squeeze(1)
last_transition_probability = torch.sigmoid(last_transition_vector)
log_probability_of_transitioning = OverflowUtils.log_clamped(last_transition_probability)
last_transition_probability_index = self.get_mask_for_last_item(inputs_len, inputs_len.device)
log_probability_of_transitioning = log_probability_of_transitioning.masked_fill(
~last_transition_probability_index, -float("inf")
)
final_log_c = last_log_alpha_scaled + log_probability_of_transitioning
# If the length of the mel is less than the number of states it will select the -inf values leading to nan gradients
# Ideally, we should clean the dataset; otherwise the clamp below is a small hack to avoid those values
final_log_c = final_log_c.clamp(min=torch.finfo(final_log_c.dtype).min)
sum_final_log_c = torch.logsumexp(final_log_c, dim=1)
return sum_final_log_c
@staticmethod
def get_mask_for_last_item(lengths, device, out_tensor=None):
"""Returns n-1 mask for the last item in the sequence.
Args:
lengths (torch.IntTensor): lengths in a batch
device (str, optional): Defaults to "cpu".
out_tensor (torch.Tensor, optional): uses the memory of a specific tensor.
Defaults to None.
Returns:
- Shape: :math:`(b, max_len)`
"""
max_len = torch.max(lengths).item()
ids = (
torch.arange(0, max_len, device=device) if out_tensor is None else torch.arange(0, max_len, out=out_tensor)
)
mask = ids == lengths.unsqueeze(1) - 1
return mask
@torch.inference_mode()
def inference(
self,
inputs: torch.FloatTensor,
input_lens: torch.LongTensor,
sampling_temp: float,
max_sampling_time: int,
duration_threshold: float,
):
"""Inference from autoregressive neural HMM
Args:
inputs (torch.FloatTensor): input states
- shape: :math:`(b, T, d)`
input_lens (torch.LongTensor): input state lengths
- shape: :math:`(b)`
sampling_temp (float): sampling temperature
max_sampling_time (int): max sampling time
duration_threshold (float): duration threshold to switch to next state
- Use this to change the speaking rate of the synthesised audio
"""
b = inputs.shape[0]
outputs = {
"hmm_outputs": [],
"hmm_outputs_len": [],
"alignments": [],
"input_parameters": [],
"output_parameters": [],
}
for i in range(b):
neural_hmm_outputs, states_travelled, input_parameters, output_parameters = self.sample(
inputs[i : i + 1], input_lens[i], sampling_temp, max_sampling_time, duration_threshold
)
outputs["hmm_outputs"].append(neural_hmm_outputs)
outputs["hmm_outputs_len"].append(neural_hmm_outputs.shape[0])
outputs["alignments"].append(states_travelled)
outputs["input_parameters"].append(input_parameters)
outputs["output_parameters"].append(output_parameters)
outputs["hmm_outputs"] = nn.utils.rnn.pad_sequence(outputs["hmm_outputs"], batch_first=True)
outputs["hmm_outputs_len"] = torch.tensor(
outputs["hmm_outputs_len"], dtype=input_lens.dtype, device=input_lens.device
)
return outputs
@torch.inference_mode()
def sample(self, inputs, input_lens, sampling_temp, max_sampling_time, duration_threshold):
"""Samples an output from the parameter models
Args:
inputs (torch.FloatTensor): input states
- shape: :math:`(1, T, d)`
input_lens (torch.LongTensor): input state lengths
- shape: :math:`(1)`
sampling_temp (float): sampling temperature
max_sampling_time (int): max sampling time
duration_threshold (float): duration threshold to switch to next state
Returns:
outputs (torch.FloatTensor): Output Observations
- Shape: :math:`(T, output_dim)`
states_travelled (list[int]): Hidden states travelled
- Shape: :math:`(T)`
input_parameters (list[torch.FloatTensor]): Input parameters
output_parameters (list[torch.FloatTensor]): Output parameters
"""
states_travelled, outputs, t = [], [], 0
# Sample initial state
current_state = 0
states_travelled.append(current_state)
# Prepare autoregression
prenet_input = self.go_tokens.unsqueeze(0).expand(1, self.ar_order, self.frame_channels)
h_memory, c_memory = self._init_lstm_states(1, self.memory_rnn_dim, prenet_input)
input_parameter_values = []
output_parameter_values = []
quantile = 1
while True:
memory_input = self.prenet(prenet_input.flatten(1).unsqueeze(0))
# will be 1 while sampling
h_memory, c_memory = self.memory_rnn(memory_input.squeeze(0), (h_memory, c_memory))
z_t = inputs[:, current_state].unsqueeze(0) # Add fake time dimension
mean, std, transition_vector = self.output_net(h_memory, z_t)
transition_probability = torch.sigmoid(transition_vector.flatten())
staying_probability = torch.sigmoid(-transition_vector.flatten())
# Save for plotting
input_parameter_values.append([prenet_input, current_state])
output_parameter_values.append([mean, std, transition_probability])
x_t = self.emission_model.sample(mean, std, sampling_temp=sampling_temp)
# Prepare autoregressive input for next iteration
prenet_input = torch.cat((prenet_input, x_t), dim=1)[:, 1:]
outputs.append(x_t.flatten())
transition_matrix = torch.cat((staying_probability, transition_probability))
quantile *= staying_probability
if not self.deterministic_transition:
switch = transition_matrix.multinomial(1)[0].item()
else:
switch = quantile < duration_threshold
if switch:
current_state += 1
quantile = 1
states_travelled.append(current_state)
if (current_state == input_lens) or (max_sampling_time and t == max_sampling_time - 1):
break
t += 1
return (
torch.stack(outputs, dim=0),
F.one_hot(input_lens.new_tensor(states_travelled)),
input_parameter_values,
output_parameter_values,
)
@staticmethod
def _initialize_log_state_priors(text_embeddings):
"""Creates the log pi in forward algorithm.
Args:
text_embeddings (torch.FloatTensor): used to create the log pi
on current device
Shapes:
- text_embeddings: (B, T, D_out_enc)
"""
N = text_embeddings.shape[1]
log_state_priors = text_embeddings.new_full([N], -float("inf"))
log_state_priors[0] = 0.0
return log_state_priors
class TransitionModel(nn.Module):
"""Transition Model of the HMM, it represents the probability of transitioning
from the current state to all other states"""
def forward(self, log_alpha_scaled, transition_vector, inputs_len): # pylint: disable=no-self-use
r"""
product of the past state with transitional probabilities in log space
Args:
log_alpha_scaled (torch.Tensor): Multiply previous timestep's alphas by
transition matrix (in log domain)
- shape: (batch size, N)
transition_vector (torch.tensor): transition vector for each state
- shape: (N)
inputs_len (int tensor): Lengths of states in a batch
- shape: (batch)
Returns:
out (torch.FloatTensor): log probability of transitioning to each state
"""
transition_p = torch.sigmoid(transition_vector)
staying_p = torch.sigmoid(-transition_vector)
log_staying_probability = OverflowUtils.log_clamped(staying_p)
log_transition_probability = OverflowUtils.log_clamped(transition_p)
staying = log_alpha_scaled + log_staying_probability
leaving = log_alpha_scaled + log_transition_probability
leaving = leaving.roll(1, dims=1)
leaving[:, 0] = -float("inf")
inputs_len_mask = sequence_mask(inputs_len)
out = OverflowUtils.logsumexp(torch.stack((staying, leaving), dim=2), dim=2)
out = out.masked_fill(~inputs_len_mask, -float("inf")) # There are no states to contribute to the loss
return out
class EmissionModel(nn.Module):
"""Emission Model of the HMM, it represents the probability of
emitting an observation based on the current state"""
def __init__(self) -> None:
super().__init__()
self.distribution_function: tdist.Distribution = tdist.normal.Normal
def sample(self, means, stds, sampling_temp):
return self.distribution_function(means, stds * sampling_temp).sample() if sampling_temp > 0 else means
def forward(self, x_t, means, stds, state_lengths):
r"""Calculates the log probability of the the given data (x_t)
being observed from states with given means and stds
Args:
x_t (float tensor) : observation at current time step
- shape: (batch, feature_dim)
means (float tensor): means of the distributions of hidden states
- shape: (batch, hidden_state, feature_dim)
stds (float tensor): standard deviations of the distributions of the hidden states
- shape: (batch, hidden_state, feature_dim)
state_lengths (int tensor): Lengths of states in a batch
- shape: (batch)
Returns:
out (float tensor): observation log likelihoods,
expressing the probability of an observation
being generated from a state i
shape: (batch, hidden_state)
"""
emission_dists = self.distribution_function(means, stds)
out = emission_dists.log_prob(x_t.unsqueeze(1))
state_lengths_mask = sequence_mask(state_lengths).unsqueeze(2)
out = torch.sum(out * state_lengths_mask, dim=2)
return out
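A minimal sketch of one step of the forward recursion using the two models above (EmissionModel and TransitionModel assumed in scope; sizes are made up); the two log-domain terms are added exactly as in NeuralHMM.forward.

import torch

B, N, D = 2, 5, 80
emission, transition = EmissionModel(), TransitionModel()
x_t = torch.randn(B, D)                               # one mel frame per batch item
means, stds = torch.randn(B, N, D), torch.ones(B, N, D)
state_lens = torch.tensor([5, 3])
log_alpha_prev = torch.log_softmax(torch.randn(B, N), dim=1)
transition_vector = torch.randn(B, N)
log_alpha_t = emission(x_t, means, stds, state_lens) + transition(
    log_alpha_prev, transition_vector, state_lens
)
print(log_alpha_t.shape)                              # torch.Size([2, 5])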

View File

@ -0,0 +1,79 @@
from typing import Any
import matplotlib.pyplot as plt
import numpy as np
import torch
def validate_numpy_array(value: Any):
r"""
Validates the input and makes sure it returns a numpy array (i.e. on CPU)
Args:
value (Any): the input value
Raises:
TypeError: if the value is not a numpy array or torch tensor
Returns:
np.ndarray: numpy array of the value
"""
if isinstance(value, np.ndarray):
pass
elif isinstance(value, list):
value = np.array(value)
elif torch.is_tensor(value):
value = value.cpu().numpy()
else:
raise TypeError("Value must be a numpy array, a torch tensor or a list")
return value
def get_spec_from_most_probable_state(log_alpha_scaled, means, decoder=None):
"""Get the most probable state means from the log_alpha_scaled.
Args:
log_alpha_scaled (torch.Tensor): Log alpha scaled values.
- Shape: :math:`(T, N)`
means (torch.Tensor): Means of the states.
- Shape: :math:`(N, T, D_out)`
decoder (torch.nn.Module): Decoder module to decode the latent to melspectrogram. Defaults to None.
"""
max_state_numbers = torch.max(log_alpha_scaled, dim=1)[1]
max_len = means.shape[0]
n_mel_channels = means.shape[2]
max_state_numbers = max_state_numbers.unsqueeze(1).unsqueeze(1).expand(max_len, 1, n_mel_channels)
means = torch.gather(means, 1, max_state_numbers).squeeze(1).to(log_alpha_scaled.dtype)
if decoder is not None:
mel = (
decoder(means.T.unsqueeze(0), torch.tensor([means.shape[0]], device=means.device), reverse=True)[0]
.squeeze(0)
.T
)
else:
mel = means
return mel
def plot_transition_probabilities_to_numpy(states, transition_probabilities, output_fig=False):
"""Generates trainsition probabilities plot for the states and the probability of transition.
Args:
states (torch.IntTensor): the states
transition_probabilities (torch.FloatTensor): the transition probabilities
"""
states = validate_numpy_array(states)
transition_probabilities = validate_numpy_array(transition_probabilities)
fig, ax = plt.subplots(figsize=(30, 3))
ax.plot(transition_probabilities, "o")
ax.set_title("Transition probability of state")
ax.set_xlabel("hidden state")
ax.set_ylabel("probability")
ax.set_xticks([i for i in range(len(transition_probabilities))]) # pylint: disable=unnecessary-comprehension
ax.set_xticklabels([int(x) for x in states], rotation=90)
plt.tight_layout()
if not output_fig:
plt.close()
return fig
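A quick usage sketch for the plotting helper above (assumed in scope), e.g. for logging how likely the model was to leave each visited state; matplotlib is imported by the module itself and the output path is hypothetical.

import torch

states = torch.arange(10)
transition_probabilities = torch.rand(10)
fig = plot_transition_probabilities_to_numpy(states, transition_probabilities, output_fig=True)
fig.savefig("transition_probabilities.png")   # hypothetical output path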

View File

@ -50,7 +50,6 @@ class GravesAttention(nn.Module):
COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi))
def __init__(self, query_dim, K):
super().__init__()
self._mask_value = 1e-8
self.K = K

View File

@ -83,7 +83,6 @@ class ReferenceEncoder(nn.Module):
"""
def __init__(self, num_mel, out_dim):
super().__init__()
self.num_mel = num_mel
filters = [1] + [32, 32, 64, 64, 128, 128]

View File

@ -31,7 +31,6 @@ class ReferenceEncoder(nn.Module):
"""
def __init__(self, num_mel, embedding_dim):
super().__init__()
self.num_mel = num_mel
filters = [1] + [32, 32, 64, 64, 128, 128]
@ -119,7 +118,6 @@ class MultiHeadAttention(nn.Module):
"""
def __init__(self, query_dim, key_dim, num_units, num_heads):
super().__init__()
self.num_units = num_units
self.num_heads = num_heads

Some files were not shown because too many files have changed in this diff.