Compare commits


548 Commits

Author SHA1 Message Date
Nick Potafiy dbf1a08a0d
Update generic_utils.py (#3561)
Handles cases when git branch produces no output or invalid output. Right now, it just crashes with `StopIteration`
2024-02-10 11:20:58 -03:00
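A minimal sketch of the guard this fix implies, assuming a helper named `get_git_branch` and an illustrative fallback value (not the exact patch):
```
import subprocess

def get_git_branch() -> str:
    """Return the current git branch, or a fallback when git output is missing or unusable."""
    try:
        out = subprocess.check_output(["git", "branch"], stderr=subprocess.DEVNULL).decode("utf8")
        # next() raises StopIteration if no line starts with "*", i.e. no output or invalid output
        return next(line for line in out.splitlines() if line.startswith("*")).lstrip("* ").strip()
    except (subprocess.CalledProcessError, FileNotFoundError, StopIteration):
        return "unknown"
```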
Edresson Casanova 5dcc16d193
Bug fix in MP3 and FLAC compute length on TTSDataset (#3092)
* Bug Fix on XTTS load

* Bug fix in MP3 length on TTSDataset

* Update TTS/tts/datasets/dataset.py

Co-authored-by: Aarni Koskela <akx@iki.fi>

* Uses mutagen for all audio formats

* Add dataloader test with all supported audio formats

* Use mutagen.File

* Update

* Fix aux unit tests

* Bug fix on unit tests

---------

Co-authored-by: Aarni Koskela <akx@iki.fi>
2023-12-27 13:23:43 -03:00
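A rough sketch of the mutagen-based length check described above; the helper name and the sample-rate conversion are assumptions for illustration:
```
from mutagen import File  # mutagen probes MP3, FLAC, WAV, OGG, ... through one entry point

def audio_length_in_samples(path: str, sample_rate: int) -> int:
    """Return the clip length in samples without decoding the whole file."""
    meta = File(path)
    if meta is None or meta.info is None:
        raise ValueError(f"Unsupported or unreadable audio file: {path}")
    return int(meta.info.length * sample_rate)  # .info.length is the duration in seconds
```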
Eren Gölge 55c7063724
Merge pull request #3423 from idiap/fix-aux-tests
Fix CI (save best model after 0 steps in tests)
2023-12-14 18:00:30 +01:00
Enno Hermann 99fee6f5ad build: use Trainer>=0.0.36 2023-12-14 14:26:31 +01:00
Eren Gölge 186cafb34c
Merge pull request #3412 from coqui-ai/reuben/docs-studio-refs
Remove Coqui Studio references
2023-12-13 08:54:57 +01:00
Eren Gölge 3991d83b2c
Merge branch 'dev' into reuben/docs-studio-refs 2023-12-13 08:53:43 +01:00
Eren Gölge fa28f99f15
Update to v0.22.0 2023-12-12 16:10:46 +01:00
Eren Gölge 8c1a8b522b
Merge pull request #3405 from coqui-ai/studio_speakers
Add studio speakers to open source XTTS!
2023-12-12 16:10:09 +01:00
Reuben Morais 0859e9f252
Remove Coqui Studio references 2023-12-12 16:09:57 +01:00
Enno Hermann 9f325b1f6c fixup! Fix aux unit tests 2023-12-12 16:07:16 +01:00
Edresson Casanova fc099218df Fix aux unit tests 2023-12-12 16:07:16 +01:00
Eren Gölge 934b87bbd1
Merge pull request #3391 from aaron-lii/multi-gpu
support multiple GPU training for XTTS
2023-12-12 13:51:26 +01:00
Eren Gölge b0fe0e678d
Merge pull request #3392 from joelhoward0/fix_contributing_typo
fixes a typo
2023-12-12 13:50:59 +01:00
Eren Gölge 936084be7e
Merge pull request #3404 from freds0/dev
Training fastspeech2 with External Speaker Embeddings
2023-12-12 13:50:27 +01:00
Eren Gölge 8e6a7cbfbf
Update .models.json 2023-12-12 13:50:01 +01:00
Eren Gölge 8999780aff
Update test_models.py 2023-12-12 13:30:21 +01:00
Eren Gölge 4dc0722bbc
Update .models.json 2023-12-12 13:28:16 +01:00
Edresson Casanova 4b33699b41 Update docs 2023-12-12 09:22:07 -03:00
Edresson Casanova b6e1ac66d9 Add docs 2023-12-12 09:19:56 -03:00
WeberJulian 61b67ef16f Fix read_json_with_comments 2023-12-11 23:58:52 +01:00
WeberJulian d47b6df4e5 Make comments in .model.json valid 2023-12-11 23:35:27 +01:00
WeberJulian 605a857add Remove tortoise 2023-12-11 23:35:07 +01:00
WeberJulian b40750d1f5 Remove models that require app.coqui.ai 2023-12-11 23:17:54 +01:00
WeberJulian ecc38891fb Fix CI readme 2023-12-11 23:01:30 +01:00
WeberJulian 5ab228dff2 Fix CI 2023-12-11 22:31:53 +01:00
WeberJulian 8c20a599d8 Remove coqui studio integration from TTS 2023-12-11 22:11:46 +01:00
WeberJulian 5cd750ac7e Fix API and CI 2023-12-11 20:21:53 +01:00
WeberJulian e3c9dab7a3 Make CLI work 2023-12-11 18:49:18 +01:00
WeberJulian 0a90359a42 rename speaker file 2023-12-11 18:48:49 +01:00
WeberJulian a5c0d9780f rename manager 2023-12-11 18:48:31 +01:00
WeberJulian 36143fee26 Add basic speaker manager 2023-12-11 15:25:46 +01:00
Frederico S. Oliveira f9117918fe
Update .models.json 2023-12-11 10:47:31 -03:00
Frederico S. Oliveira 163f9a3fdf
Merge branch 'coqui-ai:dev' into dev 2023-12-11 10:04:07 -03:00
WeberJulian 0a136a8535 Download speaker file 2023-12-11 11:29:36 +01:00
joelhoward0 e535cfe07c fixes a typo 2023-12-08 14:19:57 +00:00
Aaron-Li b6e929696a support multiple GPU training 2023-12-08 16:55:32 +08:00
Eren Gölge c99e885cc8
Merge pull request #3373 from coqui-ai/add-doc-xtts
Add inference parameters
2023-12-07 14:07:28 +01:00
Eren Gölge 4b35a1e756
Merge pull request #3381 from JRMeyer/licensing-message
Print message for either commercial license or CPML
2023-12-07 13:57:39 +01:00
Josh Meyer 759d9ab3ae
Print message for either commercial license or CPML 2023-12-07 13:54:48 +01:00
Eren Gölge 6b2ba527fa
Merge pull request #3368 from omahs/patch-1
Fix typos
2023-12-06 15:10:14 +01:00
WeberJulian 7d1a6defd6 Add inference parameters 2023-12-06 11:43:31 +01:00
omahs f659fa16bc
fix typo 2023-12-05 09:50:33 +01:00
omahs 716657c835
fix typos 2023-12-05 09:48:03 +01:00
omahs 775a9138b7
fix typo 2023-12-05 09:47:07 +01:00
omahs cfb143b9fb
fix typos 2023-12-05 09:46:36 +01:00
omahs c03fe7377b
fix typos 2023-12-05 09:45:00 +01:00
omahs bba21b86c6
fix typo 2023-12-05 09:41:23 +01:00
Eren Gölge e49c512d99
Merge pull request #3351 from aaron-lii/chinese-puncs
fix pause problem of Chinese speech
2023-12-04 15:57:42 +01:00
Eren Gölge 9c7b850995
Merge pull request #3352 from VladCuciureanu/patch-1
fix: Few typos in Tortoise docs.
2023-12-04 15:56:37 +01:00
Eren Gölge 2d02015978
Update to v0.21.3 2023-12-01 23:52:57 +01:00
Edresson Casanova 5f900f156a
Add XTTS Fine tuning gradio demo (#3296)
* Add XTTS FT demo data processing pipeline

* Add training and inference columns

* Uses tabs instead of columns

* Fix demo freezing issue

* Update demo

* Convert stereo to mono

* Bug fix on XTTS inference

* Update gradio demo

* Update gradio demo

* Update gradio demo

* Update gradio demo

* Add parameters to be able to set them on the colab demo

* Add error messages

* Add intuitive error messages

* Update

* Add max_audio_length parameter

* Add XTTS fine-tuner docs

* Update XTTS finetuner docs

* Delete trainer to free memory

* Delete unused variables

* Add gc.collect()

* Update xtts.md

---------

Co-authored-by: Eren Gölge <erogol@hotmail.com>
2023-12-01 23:52:23 +01:00
Vlad Cuciureanu f5b41674e8
fix: Few typos in Tortoise docs. 2023-12-01 20:42:41 +02:00
Aaron-Li 7b8808186a fix pause problem of Chinese speech 2023-12-01 23:30:03 +08:00
Frederico S. Oliveira bcd500fa7b Fixing bug
Correction in training the Fastspeech/Fastspeech2/FastPitch/SpeedySpeech model using external speaker embedding.
2023-11-30 17:27:05 -03:00
Frederico S. Oliveira a26e51b0b4
Merge branch 'coqui-ai:dev' into dev 2023-11-30 14:19:05 -03:00
Eren Gölge 6d1905c2b7
Update to v0.21.2 2023-11-30 13:05:10 +01:00
Hannes Krumbiegel e40527b103
Fix link to installation instructions (#3329) 2023-11-30 13:03:33 +01:00
Enno Hermann 39321d02be
fix: correctly strip/restore initial punctuation (#3336)
* refactor(punctuation): remove orphan code for handling lone punctuation

The case of lone punctuation is already handled at the top of restore(). The
removed if statement would never be called and would in fact raise an
AttributeError because the _punc_index named tuple doesn't have the attribute
`mark`.

* refactor(punctuation): remove unused argument

* fix(punctuation): correctly handle initial punctuation

Stripping and restoring initial punctuation didn't work correctly because the
string-splitting caused an additional empty string to be inserted in the text
list (because `".A".split(".")` => `["", "A"]`). Now, an initial empty string is
skipped and relevant test cases are added.

Fixes #3333
2023-11-30 13:03:16 +01:00
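The root cause can be reproduced in a couple of lines; the skip of the empty first segment below is only a sketch of the described handling:
```
text = ".A"
segments = text.split(".")
print(segments)  # ['', 'A'] -- the leading empty string shifted punctuation restoration

# Sketch: drop a spurious empty first segment before re-attaching the stripped marks
if segments and segments[0] == "":
    segments = segments[1:]
print(segments)  # ['A']
```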
Eren Gölge 93283385e0
Merge pull request #3318 from coqui-ai/calling_hf_models
Run XTTS models by direct name with versions
2023-11-30 13:02:26 +01:00
Frederico S. Oliveira 77c2155609
Merge pull request #1 from coqui-ai/dev
Update
2023-11-29 17:24:02 -03:00
Eren G??lge bfbaffc84a Fixup 2023-11-28 13:47:45 +01:00
Eren G??lge 18b7d746cb Updating XTTS docs 2023-11-27 14:54:49 +01:00
Eren G??lge b75e90ba85 Make text splitting optional 2023-11-27 14:53:11 +01:00
Eren G??lge 3b8894a3dd Make style 2023-11-27 14:15:50 +01:00
Eren G??lge 2fd8cf3d94 Make xtts runnable by version names 2023-11-27 14:15:16 +01:00
Eren G??lge 11ec9f7471 Add hi in config defaults 2023-11-24 15:38:36 +01:00
Eren G??lge 00a870c26a Update to v0.21.1 2023-11-24 15:15:44 +01:00
Eren G??lge 7e575068c9 Merge branch 'dev' of https://github.com/coqui-ai/TTS into dev 2023-11-24 15:15:19 +01:00
Eren G??lge 32065139e7 Simple text cleaner for "hi" 2023-11-24 15:14:34 +01:00
Eren Gölge 1542a50c3a
Update to v0.21.0 2023-11-24 14:37:05 +01:00
Eren G??lge 6dd43b0ce2 Update to XTTS v2.0.3 2023-11-24 14:36:04 +01:00
Julian Weber a55755c8df
update deepspeed version (#3281) 2023-11-24 12:35:49 +01:00
Kaszanas 1bf5926196
Introducing Development Dockerfile (#3263)
* Moved Dockerfile, COPY at the end

This change should prevent re-installation of the dependencies upon
every change of the repository's contents. Typically if Docker detects
that something changed in a layer, all downstream layers are invalidated
and rebuilt.

* Moved Dockerfile back to main directory

Main dockerfile in a separate directory can cause issues with the
current CI/CD setup. This can be a good change for later.

* Introduced Dockerfile.dev, updated CONTRIBUTING

Dockerfile.dev can be used as a separate development environment for
anyone that does not wish to install the dependencies locally.
2023-11-24 12:30:15 +01:00
TITC 4d0f53d2ee
Misjudgment of `is_multi_lingual` When Loading Multilingual Model via `model_path` (#3273)
* load multilingual model by path

* use the config to determine whether the model is multilingual
2023-11-24 12:28:31 +01:00
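A sketch of the idea behind the fix: derive multilinguality from the loaded config instead of the model name. `load_config` is the package's config loader; the path and the `languages` attribute check are illustrative assumptions:
```
from TTS.config import load_config

config = load_config("/path/to/model_dir/config.json")
# Illustrative check: multilingual models typically list their languages in the config
is_multi_lingual = bool(getattr(config, "languages", None))
print(is_multi_lingual)
```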
Enno Hermann 8c5227ed84
Fix tts_with_vc (#3275)
* Revert "fix for issue 3067"

This reverts commit 041b4b6723.

Fixes #3143. The original issue (#3067) was people trying to use
tts.tts_with_vc_to_file() with XTTS and was "fixed" in #3109. But XTTS has
integrated VC and you can just do tts.tts_to_file(..., speaker_wav="..."); there
is no point in passing it through FreeVC afterwards. So, reverting this commit
because it breaks tts.tts_with_vc_to_file() for any model that doesn't have
integrated VC, i.e. all models this method is meant for.

* fix: support multi-speaker models in tts_with_vc/tts_with_vc_to_file

* fix: only compute spk embeddings for models that support it

Fixes #1440. Passing a `speaker_wav` argument to regular Vits models failed
because they don't support voice cloning. Now that argument is simply ignored.
2023-11-24 12:26:37 +01:00
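The distinction drawn in this commit, as a usage sketch of the high-level API (model names and file paths are examples):
```
from TTS.api import TTS

# XTTS has integrated voice cloning: pass speaker_wav directly, no FreeVC pass needed
xtts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
xtts.tts_to_file(text="Hello there.", speaker_wav="target_voice.wav",
                 language="en", file_path="xtts_out.wav")

# Models without integrated cloning are routed through FreeVC via tts_with_vc_to_file()
vits = TTS("tts_models/en/ljspeech/vits")
vits.tts_with_vc_to_file(text="Hello there.", speaker_wav="target_voice.wav",
                         file_path="vits_vc_out.wav")
```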
Enno Hermann 2af0220996
fix: don't pass quotes to espeak (#3286)
Previously, the text was wrapped in an additional set of quotes that was passed
to Espeak. This could result in different phonemization in certain edge cases and
caused the insertion of an initial separator "_" that had to be removed.
Compare:
```
$ espeak-ng -q -b 1 -v en-us --ipa=1 '"A"'
_ˈɐ
$ espeak-ng -q -b 1 -v en-us --ipa=1 'A'
ˈeɪ
```

Fixes #2619
2023-11-24 12:25:37 +01:00
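For illustration only (not the repository's phonemizer code): calling espeak-ng with an argv list sidesteps shell quoting, so the text never gains the extra quotes that changed the phonemization:
```
import subprocess

def espeak_ipa(text: str, voice: str = "en-us") -> str:
    # argv is a list, so no shell is involved and no quotes wrap the text
    cmd = ["espeak-ng", "-q", "-b", "1", "-v", voice, "--ipa=1", text]
    return subprocess.run(cmd, capture_output=True, text=True, check=True).stdout.strip()

print(espeak_ipa("A"))  # ˈeɪ, with no leading "_" separator to strip
```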
Enno Hermann 4a2684be34
fix(bin.synthesize): more informative error for wrong --language argument (#3294)
In multilingual models, the target language is specified via the
`--language_idx` argument. However, the `tts` CLI also accepts a `--language`
argument for use with Coqui Studio, so it is easy to choose the wrong one,
resulting in the following confusing error at synthesis time:

```
AssertionError:   Language None is not supported. Supported languages are
['en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar',
'zh-cn', 'hu', 'ko', 'ja']
```

This commit adds a better error message when `--language` is passed for a
non-studio model.

Fixes #3270, fixes #3291
2023-11-24 12:24:42 +01:00
Tessa Painter 64f391b583
Made the tqdm `progress_bar` objects of static download methods a static class variable (#3297) 2023-11-24 12:23:59 +01:00
Eren Gölge b47d9c6e36
Merge pull request #3243 from idiap/checkpoints
Remove duplicate/unused code
2023-11-22 23:52:06 +01:00
Eren Gölge 29dede20d3
Merge pull request #3249 from coqui-ai/run_ci_for_v0.20.6
Run CI for v0.20.6
2023-11-17 15:45:26 +01:00
Eren Gölge c011ab7455 Update to v0.20.6 2023-11-17 15:16:32 +01:00
Eren G??lge 52cb1e2f68 Update model hash for v2.0.2 2023-11-17 15:16:32 +01:00
Edresson Casanova 6075fa208c Ensures that only GPT model is in training mode during XTTS GPT training (#3241)
* Ensures that only GPT model is in training mode during training

* Fix parallel wavegan unit test
2023-11-17 15:15:22 +01:00
Eren G??lge a3279f9294 Make style 2023-11-17 15:15:22 +01:00
Eren G??lge f21067a84a Make k_diffusion optional 2023-11-17 15:15:21 +01:00
Eren G??lge 44494daa27 Update CI version 2023-11-17 15:15:21 +01:00
Eren G??lge c864acf2b7 Update versions 2023-11-17 15:15:21 +01:00
Edresson Casanova 11283fce07
Ensures that only GPT model is in training mode during XTTS GPT training (#3241)
* Ensures that only GPT model is in training mode during training

* Fix parallel wavegan unit test
2023-11-17 15:13:46 +01:00
Eren Gölge 14579a4607
Merge pull request #3248 from coqui-ai/slacker_deps
Update versions
2023-11-17 15:13:19 +01:00
Eren G??lge 44880f09ed Make style 2023-11-17 13:43:34 +01:00
Eren G??lge 26efdf6ee7 Make k_diffusion optional 2023-11-17 13:42:33 +01:00
Eren G??lge 08d11e9198 Update CI version 2023-11-17 13:01:32 +01:00
Eren G??lge 63d7145647 Update versions 2023-11-17 12:10:46 +01:00
Enno Hermann 0fb0d67de7 refactor: use save_checkpoint()/save_best_model() from Trainer 2023-11-17 01:18:23 +01:00
Enno Hermann 96678c7ba2 refactor: use copy_model_files() from Trainer 2023-11-17 01:18:23 +01:00
Enno Hermann 5119e651a1 chore(utils.io): remove unused code
These are all available in Trainer.
2023-11-17 01:18:23 +01:00
Enno Hermann 39fe38bda4 refactor: use save_fsspec() from Trainer 2023-11-17 01:18:23 +01:00
Enno Hermann fdf0c8b10a chore(encoder): remove unused code 2023-11-17 01:18:23 +01:00
Eren Gölge 7e4375da2b
Update to v0.20.6 2023-11-16 17:52:13 +01:00
Julian Weber fbc18b8c34
Fix zh bug (#3238) 2023-11-16 17:51:37 +01:00
Julian Weber 675f983550
Add sentence splitting (#3227)
* Add sentence splitting

* update requirements

* update default args v2

* Add spanish

* Fix return gpt_latents

* Update requirements

* Fix requirements
2023-11-16 11:01:11 +01:00
Enno Hermann 3c2d5a9e03
Remove duplicate AudioProcessor code and fix ExtractTTSpectrogram.ipynb (#3230)
* chore: remove unused argument

* refactor(audio.processor): remove duplicate stft+griffin_lim

* chore(audio.processor): remove unused compute_stft_paddings

Same function available in numpy_transforms

* refactor(audio.processor): remove duplicate db_to_amp

* refactor(audio.processor): remove duplicate amp_to_db

* refactor(audio.processor): remove duplicate linear_to_mel

* refactor(audio.processor): remove duplicate mel_to_linear

* refactor(audio.processor): remove duplicate build_mel_basis

* refactor(audio.processor): remove duplicate stft_parameters

* refactor(audio.processor): use pre-/deemphasis from numpy_transforms

* refactor(audio.processor): use rms_volume_norm from numpy_transforms

* chore(audio.processor): remove duplicate assert

Already checked in numpy_transforms.compute_f0

* refactor(audio.processor): use find_endpoint from numpy_transforms

* refactor(audio.processor): use trim_silence from numpy_transforms

* refactor(audio.processor): use volume_norm from numpy_transforms

* refactor(audio.processor): use load_wav from numpy_transforms

* fix(bin.extract_tts_spectrograms): set quantization bits

* fix(ExtractTTSpectrogram.ipynb): adapt to current TTS code

Fixes #2447, #2574

* refactor(audio.processor): remove duplicate quantization methods
2023-11-16 10:57:06 +01:00
Eren Gölge 88630c60e5
Update to v0.20.5 2023-11-15 14:02:51 +01:00
Edresson Casanova 73a5bd08c0
Fix XTTS GPT padding and inference issues (#3216)
* Fix end artifact for fine tuning models

* Bug fix on zh-cn inference

* Remove unused code
2023-11-15 14:02:05 +01:00
Ikko Eltociear Ashimine 15f0ac57d6
Update README.md (#3215)
Dicord -> Discord
2023-11-15 13:59:56 +01:00
Julian Weber 04901fb2e4
Add speed control for inference (#3214)
* Add speed control for inference

* Fix XTTS tests

* Add speed control tests
2023-11-14 16:07:17 +01:00
Eren Gölge d96f3885d5
Update to v0.20.4 2023-11-13 17:07:25 +01:00
Eren Gölge ac3df409a6
Merge pull request #3208 from coqui-ai/fix_max_mel_len
fix max generation length for XTTS
2023-11-13 14:32:56 +01:00
Eren Gölge f32a465711
Merge pull request #3207 from coqui-ai/update_xtts_cloning
Update XTTS cloning
2023-11-13 14:32:43 +01:00
Eren G??lge 92fa988aec Fixup 2023-11-13 13:44:06 +01:00
WeberJulian b85536b23f fix max generation length 2023-11-13 13:18:45 +01:00
Eren G??lge b2682d39c5 Make style 2023-11-13 13:01:01 +01:00
Eren G??lge a16360af85 Implement chunking gpt_cond 2023-11-13 13:00:08 +01:00
Eren Gölge 6f1cba2f81
Update to v0.20.3 2023-11-09 17:41:37 +01:00
Enno Hermann 3b1e7038bc
fix(formatters): set missing root_path attribute (#3182)
Fixes #2778
2023-11-09 16:49:52 +01:00
Aarni Koskela a8e9163fb3
xtts/tokenizer: merge duplicate implementations of preprocess_text (#3170)
This was found via ruff:

> F811 Redefinition of unused `preprocess_text` from line 570
2023-11-09 16:32:12 +01:00
Matthew Boakes 1b9c400bca
PyTorch 2.1 Updates (Weight Norm and TorchAudio I/O) (#3176)
* Replaced PyTorch weight_norm With parametrizations.weight_norm

* TorchAudio: Migrating The I/O Functions To Use The Dispatcher Mechanism

* Corrected Code Style

---------

Co-authored-by: Eren Gölge <erogol@hotmail.com>
2023-11-09 16:31:03 +01:00
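A small sketch of the two migrations named in this PR (layer shapes and file names are arbitrary examples):
```
import torch
from torch import nn
from torch.nn.utils import parametrizations
import torchaudio

# weight_norm: the old torch.nn.utils.weight_norm() hook is deprecated in favor of the
# parametrization-based variant
conv = nn.Conv1d(80, 256, kernel_size=3, padding=1)
conv = parametrizations.weight_norm(conv)

# TorchAudio >= 2.1 routes load/save through a dispatcher that selects a backend
waveform, sample_rate = torchaudio.load("sample.wav")
torchaudio.save("copy.wav", waveform, sample_rate)
```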
Gorkem 66a1e248d0
torchaudio should use proper backend to load audio (#3179) 2023-11-09 16:28:39 +01:00
Eren Gölge 46d9c27212
Update to v0.20.2 2023-11-08 16:07:56 +01:00
Julian Weber 58cb0d8dd0
Remove v1 doc and tests (#3172)
* remove v1 in inference.md

* remove v1 in README.md

* Update test_models.py
2023-11-08 14:51:42 +01:00
Julian Weber 03ad90135b
Add lang code in XTTS doc (#3158)
* Add lang code in XTTS doc

* Remove unused config and args

* update docs

* woops
2023-11-08 13:47:33 +01:00
Gorkem 78a596618a
Fix for exception on streaming if last chunk empty (#3160) 2023-11-08 11:32:02 +01:00
Enno Hermann 99edd6daa3
Fix ModelManager.list_models() (#3128)
* fix(utils.manage): remove hard-coded model_type variable

* refactor(utils.manage): address lint issues, fix typos

Addressed the following:
TTS/utils/manage.py:307:12: R1705: Unnecessary "else" after "return" (no-else-return)
TTS/utils/manage.py:308:21: W1514: Using open without explicitly specifying an encoding (unspecified-encoding)
TTS/utils/manage.py:299:4: R1710: Either all return statements in a function should return an expression, or none of them should. (inconsistent-return-statements)
TTS/utils/manage.py:299:4: R0201: Method could be a function (no-self-use)
TTS/utils/manage.py:314:4: R0201: Method could be a function (no-self-use)
2023-11-08 11:29:01 +01:00
Eren Gölge 77b18126c7
Merge pull request #3126 from akx/freevc-config-module
Move FreeVCConfig to TTS.vc.configs (like all other config classes)
2023-11-08 11:24:47 +01:00
Eren Gölge cc6e9fcaa7
Fix #3153 (#3169) 2023-11-08 11:13:58 +01:00
Eren Gölge a24ebcd8a6
Fix coqui api (#3168) 2023-11-08 10:51:23 +01:00
Julian Weber ce1a39a9a4
Add char limit warn (#3130)
* Add char limit warning

* Adding v2 langs

* cached_property for cutlet

* Fix import
2023-11-08 10:24:23 +01:00
Eren Gölge f846a9f300
Update to v0.20.1 2023-11-07 14:17:36 +01:00
Edresson Casanova cbdbc44e0f
Fix XTTS v2.0 training recipe (#3154)
* Fix XTTS v2.0 training recipe

* Update XTTS v2 model hash
2023-11-07 14:16:44 +01:00
Eren Gölge 5e992d8704
Merge pull request #3149 from coqui-ai/fixup_xtts_v2
Bug fixes and add support for multiple speaker references on XTTS inference
2023-11-07 10:36:20 +01:00
Edresson Casanova 5f9ab6cfaa
Fix style
Co-authored-by: Aarni Koskela <akx@iki.fi>
2023-11-06 19:22:34 -03:00
Edresson Casanova 905900afc9 Update XTTS v1.1 recipe 2023-11-06 19:14:50 -03:00
Edresson Casanova 2470599d18 Drop XTTS v1 2023-11-06 19:12:04 -03:00
Edresson Casanova 13243df526 Update XTTS v1.1 files 2023-11-06 19:10:21 -03:00
Edresson Casanova cabff9f323 Update XTTS v2.0 recipe 2023-11-06 17:47:14 -03:00
Edresson Casanova 09fb317e6d Remove unused code 2023-11-06 17:36:32 -03:00
Edresson Casanova b146de4ce8 Bug fix on XTTS v2.0 Trainer 2023-11-06 20:26:01 +01:00
Edresson Casanova f444f296f2 Add multiple references on xtts inference tests 2023-11-06 20:25:06 +01:00
Edresson Casanova 1b6f8d0e46 Update unit tests and recipes 2023-11-06 20:25:06 +01:00
Edresson Casanova 72b2bac0f8 Load reference at 24 kHz to avoid issues with multiple sample-rate references 2023-11-06 20:25:06 +01:00
Edresson Casanova 00294ffdf6 Update XTTS docs 2023-11-06 20:24:06 +01:00
Edresson Casanova 459ad70dc8 Add support for multiple speaker references on XTTS inference 2023-11-06 20:22:35 +01:00
Edresson Casanova 9942000c50 Update XTTS v2 recipe model files 2023-11-06 20:20:28 +01:00
Eren Gölge f0cb19ecca
Drop diffusion from XTTS (#3150)
* Drop diffusion for XTTS

* Make style

* Drop diffusion deps in code

* Restore thrashed
2023-11-06 20:15:49 +01:00
Eren G??lge 5d418bb84a Update docs 2023-11-06 18:48:41 +01:00
Eren G??lge 9bbf6eb8dd Drop use_ne_hifigan 2023-11-06 18:43:38 +01:00
Eren G??lge 9d54bd7655 Fixup XTTS 2023-11-06 18:13:58 +01:00
Eren Gölge c713a839da
Update VERSION 2023-11-06 15:51:56 +01:00
Eren Gölge 7eedfc67da
Update README.md 2023-11-06 15:37:32 +01:00
Edresson Casanova e45227d9ff
XTTS v2.0 (#3137)
* Implement most similar ref training approach

* Use non-enhanced hifigan for test samples

* Add Perceiver

* Update GPT Trainer for perceiver support

* Update XTTS docs

* Bug fix masking with XTTS perceiver

* Bug fix on gpt forward

* Bug Fix on XTTS v2.0 training

* Add XTTS v2.0 unit tests

* Add XTTS v2.0 inference unit tests

* Bug Fix on diffusion inference

* Add XTTS v2.0 training recipe

* Placeholder model entry

* Add cloning params to config

* Make prompt embedding configurable

* Make cloning configurable

* Cheap fix for a cheaper fix

* Prevent resampling

* Update model entry

* Update docs

* Update requirements

* Code linting

* Add xtts v2 to sep tests

* Bug fix on XTTS get_gpt_cond_latents

* Bug fix on rebase

* Make style

* Bug fix in Japanese tokenizer

* Add num2words to deps

* Remove unused kwarg and added num_beams=1 as default

---------

Co-authored-by: Eren G??lge <egolge@coqui.ai>
2023-11-06 14:58:18 +01:00
Aarni Koskela 38f6f8f0bb
Run `make style` & re-enable it in CI (#3127) 2023-11-06 11:36:37 +01:00
Aarni Koskela 5ae369d629 Move FreeVCConfig to TTS.vc.configs (like all other config classes) 2023-10-31 16:56:25 +02:00
Eren Gölge 6fef4f9067
Bump up to v0.19.1 2023-10-30 10:37:28 +01:00
Eren Gölge eccc94be9b
Merge pull request #2983 from vltmedia/dev
Bug: self.model_name needed to be initialized.
2023-10-28 10:39:25 +02:00
Eren Gölge 2d6bd716ef
Merge pull request #3109 from coqui-ai/tts_3067
fix for issue 3067
2023-10-28 10:37:52 +02:00
Eren Gölge 788959d720
Merge pull request #3103 from coqui-ai/fix_xttsv1.1_again
Second round of issue fixing for XTTS v1.1
2023-10-28 10:33:19 +02:00
WeberJulian 1c98821359 Remove unused load_audio function 2023-10-27 22:27:18 +02:00
Aya Jafari 041b4b6723 fix for issue 3067 2023-10-26 13:06:01 -03:00
WeberJulian d4e08c8d6c Add features to get_conditioning_latents 2023-10-26 14:57:33 +02:00
WeberJulian c1133724a1 Move lang token add to tokenizer 2023-10-26 14:52:13 +02:00
WeberJulian 6fa46d197d Fix get_conditioning_latents when using only ne 2023-10-26 14:51:35 +02:00
Eren Gölge edd3a28723
Bump up to v0.19.0 2023-10-25 13:29:38 +02:00
Eren Gölge 16ba377f61
Merge pull request #3086 from coqui-ai/xtts_trainer
XTTS v1.1 GPT Trainer
2023-10-25 13:28:47 +02:00
Edresson Casanova 01839af926 Bug fix on XTTS masking training 2023-10-24 18:30:14 -03:00
Edresson Casanova 8af3d2dbcd Add a dedicated workflow for XTTS tests 2023-10-24 09:52:44 -03:00
VLT Media 818aa0eb7e
Merge branch 'coqui-ai:dev' into dev 2023-10-23 23:36:33 -04:00
Edresson Casanova de1d521c8a Update XTTS docs 2023-10-23 13:35:15 -03:00
Edresson Casanova 0f96abb5ec Add FT inference example on XTTS docs 2023-10-23 13:23:30 -03:00
Edresson Casanova 67ca70aff4 Fix Delightful TTS layers unit test 2023-10-23 11:47:10 -03:00
Edresson Casanova 37b7945474 Update XTTS train not implemented error to point to the XTTS docs 2023-10-23 11:39:17 -03:00
Edresson Casanova 1ee8096799 Update XTTS docs 2023-10-23 11:13:09 -03:00
Edresson Casanova 6fefc36e5a Update XTTS docs 2023-10-23 11:03:57 -03:00
Edresson Casanova 8853e1c3ec Update XTTS recipe to only download checkpoint if it is needed 2023-10-23 10:45:41 -03:00
Edresson Casanova 653f2e75ef Update xtts trainer recipe 2023-10-23 09:58:16 -03:00
Edresson Casanova e8a1a50273 Remove unused vars in Delightful TTS layers tests 2023-10-23 09:26:36 -03:00
Edresson Casanova ec7f54768a Rebase bug fix and update recipe 2023-10-21 17:37:51 -03:00
Edresson Casanova affaf11148 Add XTTS training unit test 2023-10-21 13:41:12 -03:00
Edresson Casanova 1f92741d6a Fix issue #2971 2023-10-21 13:37:21 -03:00
Edresson Casanova 94dcf84979 Rename XTTS recipe 2023-10-21 13:37:21 -03:00
Edresson Casanova 5f98dbeec9 Update Ljspeech XTTS recipe 2023-10-21 13:37:21 -03:00
Edresson Casanova 469d624615 Update LJspeech XTTS recipe 2023-10-21 13:37:21 -03:00
Edresson Casanova 9e3598c3b7 Bug Fix on inference using XTTS trainer checkpoint 2023-10-21 13:37:21 -03:00
Edresson Casanova c4ceaabe2c Add test sentences during the training 2023-10-21 13:33:56 -03:00
Edresson Casanova 2f868dd5c2 Bug fix on reproducible evaluation 2023-10-21 13:33:56 -03:00
Edresson Casanova bafab049c2 Add prompting masking 2023-10-21 13:33:56 -03:00
Edresson Casanova 47d613df3a Add reproducible evaluation 2023-10-21 13:33:56 -03:00
Edresson Casanova 40a4e631ea Update mel spectrogram for the style encoder 2023-10-21 13:33:56 -03:00
Edresson Casanova a32961bcb4 Add XTTS base training code 2023-10-21 13:33:56 -03:00
Eren Gölge 1e152692ed
Bump up to v0.18.2 2023-10-21 17:29:53 +02:00
Eren Gölge 420a90ed63
Merge pull request #3096 from coqui-ai/fix-xtts-v1.1
Fix xtts v1.1
2023-10-21 17:28:58 +02:00
Julian Weber dad6a7b0b6
Preserve [ja] token of the text processing 2023-10-21 11:26:03 +02:00
Julian Weber c7a16042e3
Remove global cutlet import 2023-10-21 11:18:58 +02:00
Edresson Casanova 414f0de0a1
Bump up to v0.18.1 2023-10-20 17:30:58 -03:00
Edresson Casanova 59576fc0ec
Bug fix on XTTS v1.1 inference (#3093)
* Bug fix on XTTS v1.1 inference

* Update .models.json

---------

Co-authored-by: Julian Weber <julian.weber@hotmail.fr>
2023-10-20 17:29:43 -03:00
Eren Gölge 85e7323739
Bump up to v0.18.0 2023-10-20 16:03:24 +02:00
Julian Weber cf97116185
XTTS v1.1 (#3089)
* Add support for ne_hifigan

* Update model.json

* Update hash

* Fix model loading

* Enhance text_normalization

* Add xtts to zoo test exception

* Add model hash check

* Add get_number_tokens
2023-10-20 16:02:08 +02:00
Eren Gölge 747f688dc3
Bump up to v0.17.10 2023-10-19 12:00:15 +02:00
Eren Gölge 93e6961bb5
Update .models.json 2023-10-19 11:59:49 +02:00
Eren Gölge bf68848f38
Bump up to v0.17.9 2023-10-19 11:22:42 +02:00
Eren Gölge c3b011217d
Update .models.json 2023-10-19 11:21:21 +02:00
Julian Weber d21f15cc85
fix readme (#3071)
* fix readme

* fix inference.md
2023-10-17 10:27:11 +02:00
Julian Weber dcce1644b7
Fix doc dataset (#3070)
* fix formatting dataset doc

* fix autocomplete
2023-10-16 12:29:52 +02:00
David Garvey a151d70242
Add stdout option (#3027)
* add cli options for play and speed
--play argument uses simpleaudio to play the tts wav
--speed <float 0.0-2.0> passes speed argument to Coqui Studio models

* remove simpleaudio not referenced in file

* fix simpleaudio dependency version

* add ALSA headers for simpleaudio compilation

* Dockerfile ALSA headers for simpleaudio

* base changes to use stdout instead of playing audio
Considering piping wav data to another program like aplay for audio playback.

This is incomplete code. Using it to get feedback before proceeding with
implementation.

* remove play in favor of a pipe_out arg that suppresses stdout
Removed the play option and the simpleaudio dependency in favor of pipe
functionality that allows passing wav file data to a program
dedicated to playing audio.

* scipy.io.wavfile.write fails with /dev/null target

* Streaming inference for XTTS 🚀 (#3035)

* v0.17.7

* Redownload XTTS when the local and remote config do not match

* Remove unused method

* Print a message when it is already downloaded

* Try-except to present an error when the user doesn't have a connection

* Fix style

* 0.17.8

* v0.17.8

---------

Co-authored-by: Julian Weber <julian.weber@hotmail.fr>
Co-authored-by: Eren Gölge <erogol@hotmail.com>
Co-authored-by: Edresson Casanova <edresson1@gmail.com>
Co-authored-by: ggoknar <ggoknar@coqui.ai>
2023-10-16 12:07:21 +02:00
Eren Gölge cae185fd16
Update README.md 2023-10-16 12:00:59 +02:00
Subash-Lamichhane b4666bb75e
fixed typo of /docs (#3065) 2023-10-16 11:57:15 +02:00
Subash-Lamichhane 3d146422c2
fixed typo of docs\source\implementing_a_new_model.md (#3066) 2023-10-16 11:57:04 +02:00
Dusty Hagstrom 13cd076a7f
Synthesizer skips over embeddings file if model only has one speaker (#2587)
* It looks like the Neon model is special in that it does not have a speaker_name and it wants to get the only item available. This was blocking a valid model with one speaker and a d_vector_file from being executed to get the embedding.

* Update synthesizer.py

oh my how embarrassing
2023-10-16 11:55:45 +02:00
Meryem Sakin e4b8d71f2b
Update AnalyzeDataset.ipynb (#2783) 2023-10-16 11:52:37 +02:00
Eren Gölge b25d96ecee
Merge pull request #3058 from coqui-ai/spkr_enc_3020
fixed bugs in fastpitch tts synthesis
2023-10-14 11:40:31 +02:00
Aya Jafari ffddf10458 unit test fix 2023-10-13 10:56:47 -03:00
Aya Jafari 6eaecab0ca fixed bugs in fastpitch tts synthesis 2023-10-10 23:02:31 -03:00
ggoknar 99635193f5 v0.17.8 2023-10-07 01:14:05 +03:00
ggoknar 3bb51b1276 0.17.8 2023-10-07 01:13:02 +03:00
Gorkem 0f46757c47
Merge pull request #3038 from coqui-ai/xtts_redonwload
XTTS redownload if needed
2023-10-07 01:02:44 +03:00
Edresson Casanova 2852404bdf Fix style 2023-10-06 17:42:46 -03:00
Edresson Casanova 99650044a4 Try-except to present an error when the user doesn't have a connection 2023-10-06 17:37:05 -03:00
Edresson Casanova 529ea3f67f Print a message when it is already downloaded 2023-10-06 17:26:40 -03:00
Edresson Casanova ee1ef1c51e Remove unused method 2023-10-06 17:21:22 -03:00
Edresson Casanova 4a6103fec9 Redownload XTTS when the local and remote config do not match 2023-10-06 17:16:30 -03:00
Eren Gölge 0520697b5f
v0.17.7 2023-10-06 18:35:26 +02:00
Julian Weber e5e0cbffc9
Streaming inference for XTTS 🚀 (#3035) 2023-10-06 18:34:06 +02:00
OPERATOR 2150136210
None is not able to be read for "XTTS"; fixes a crash if it's set to None. (#3009) 2023-10-02 12:53:36 +02:00
Anupam Maurya f133b9d2d7
Upgrade and Optimize TTS Code in extractttsspectrogram.ipynb (#3012) 2023-10-02 12:51:55 +02:00
Eren Gölge 155c5fc0bd
v0.17.6 2023-09-29 23:44:09 +02:00
Edresson Casanova 4c3c11c958
Tortoise inference fix and fix zoo unit tests (#3010) 2023-09-29 13:40:57 +02:00
Eren Gölge bb05dcb9b4
Merge pull request #2922 from coqui-ai/be_tts
Adding Belarusian TTS model
2023-09-27 09:48:28 +02:00
Eren Gölge 8cba47191f
Merge pull request #2993 from akx/tts-readme
Ensure `tts` CLI tool readme and usage is in sync
2023-09-27 09:46:54 +02:00
Eren Gölge e39da6147f
Merge pull request #2999 from akx/remove-unnecessary-black-config
Remove unnecessary black exclude config
2023-09-27 09:43:06 +02:00
Eren Gölge 536a12b045
Merge pull request #3001 from akx/fix-deps-again
Loosen dependency pins
2023-09-27 09:42:27 +02:00
Eren Gölge ea51a7ffcc
Merge pull request #3003 from akx/duplicate-code-removal
Duplicate code removal
2023-09-27 09:41:35 +02:00
Aarni Koskela 0dbe7cbcc4 Remove duplicate convert_pad_shape 2023-09-27 01:10:48 +03:00
Aarni Koskela 33a7c722f6 Merge duplicate on_train_step_start functions in delightful_tts 2023-09-27 01:10:44 +03:00
Aarni Koskela 861c68b0b8 Rename misnamed setter 2023-09-27 01:09:59 +03:00
Aarni Koskela 09e14e68db Remove duplicate get_named_beta_schedules 2023-09-27 01:09:59 +03:00
Aarni Koskela 59f85a7122 Remove duplicate code from xtts.tokenizer 2023-09-27 01:09:59 +03:00
Aarni Koskela 6277f09c5f requirements.txt: loosen pandas pin (1.4 would need to be compiled from source on macs) 2023-09-26 20:43:59 +03:00
Aarni Koskela 8bb2d652ca pyproject.toml: loosen dependencies to avoid building from source 2023-09-26 20:41:26 +03:00
Aarni Koskela 94c5fd0765 Remove unnecessary black exclude config
It seems to have been copy-pasted from the Black docs.
2023-09-26 16:02:55 +03:00
Aarni Koskela 0a82f063cc Late-import main TTS libraries in `tts` CLI 2023-09-26 15:38:56 +03:00
Aarni Koskela 5c047cf304 Ensure `tts` CLI tool readme and usage help is in sync 2023-09-26 15:38:56 +03:00
Eren Gölge 0b95b88f13
Bump up to v0.17.5 2023-09-25 18:16:45 +02:00
Eren Gölge 359755cca0
Merge pull request #2990 from coqui-ai/fix-TTS-install
fix package versions
2023-09-25 18:10:29 +02:00
WeberJulian 089ad66df2 Lower the versions constraints 2023-09-25 17:00:41 +02:00
WeberJulian bbfdfbffdf Update transformers to latest 2023-09-25 11:46:38 +02:00
WeberJulian f1c1d14c54 Add back umap 2023-09-25 11:12:01 +02:00
WeberJulian a2a15392e0 fix package versions 2023-09-25 11:01:36 +02:00
VLT Media dd73910651
Bug: self.model_name needed to be initialized.
Bug: self.model_name needed to be initialized to get around a bug that automatically crashes when the user provides the model paths but no model_name when initializing the TTS object.
2023-09-23 01:41:35 -04:00
loupzeur da8b6bbce1
fix: xtts not taking into account device flag (#2951)
* fix: xtts not taking into account device flag

* Style changes

---------

Co-authored-by: Julian Weber <julian.weber@hotmail.fr>
2023-09-20 09:57:02 +02:00
Omar Sanseviero 335ae63e01
Add coqui blog post (#2949)
* Update README.md

* Update README.md

---------

Co-authored-by: Edresson Casanova <edresson1@gmail.com>
2023-09-19 19:57:09 -03:00
Julian Weber 6916aa37ab
Fix fsspec requirement (#2970)
* Fix requirement for fsspec

* Use the right version this time
2023-09-19 15:54:12 -03:00
Reuben Morais f829bf50f8
Bump version to v0.17.4 (really) 2023-09-15 16:40:34 +02:00
Eren G??lge aa8fa4756e Bump up to v0.17.4 2023-09-14 17:52:44 +02:00
Eren G??lge 9d0b76ce23 Check env var for COQUI_TOS_AGREED 2023-09-14 17:51:40 +02:00
Eren G??lge 13dd7c4c9e Bump up to v0.17.2 2023-09-14 15:24:05 +02:00
Eren G??lge ded7fd4fb2 Make style 2023-09-14 15:23:37 +02:00
Eren G??lge 44b61d2b92 Fixup 2023-09-14 15:22:54 +02:00
Eren Gölge 623ea41634
Fix model tests (#2943) 2023-09-14 15:21:48 +02:00
Eren G??lge af62613c86 Bump up to v0.17.1 2023-09-13 18:23:39 +02:00
Eren G??lge ee7cee0e35 Fixup 2023-09-13 18:21:44 +02:00
Eren G??lge 5dcf9ae311 Bump up v0.17.0 2023-09-13 18:04:26 +02:00
Eren Gölge 4033db5f4b 🔥 XTTS implementation 2023-09-13 17:51:24 +02:00
Edresson Casanova 4d3f23b5d3
Add CML-TTS dataset YourTTS training recipe (#2934) 2023-09-12 11:49:14 +02:00
Eren Gölge 9533f8656c Make style 2023-09-04 13:58:37 +02:00
Eren Gölge 562a9509f2 Add BE model 2023-09-04 13:57:03 +02:00
Eren Gölge b4c82685a7 Add model entries 2023-09-04 13:04:58 +02:00
T145 cdc971ff74
Fixed spectrogram checking on librosa 0.10.x (#2899) 2023-09-04 12:58:27 +02:00
Cohee b3b1555d82
Fix exception handling in manage.py (#2912) 2023-09-04 12:54:30 +02:00
Eren G??lge 40b527345f Bump up to v0.16.6 2023-09-04 12:51:53 +02:00
Eren Gölge d1d95707bd
Update docs (#2919) 2023-09-04 12:28:36 +02:00
Unik 32b8ebb633
Updated scipy version (#2914) 2023-09-04 11:39:19 +02:00
Aleś Bułojčyk fead04f779
Add phonemizer for Belarusian language (#2856) 2023-08-28 11:20:45 +02:00
Jake Tae b79b6f0762
feature: add device flag to tts cli (#2875) 2023-08-28 11:20:12 +02:00
Jake Tae fa0cbd78fe
Update README with new device API (#2876)
* docs: update readme w/ .to(device) api

* docs: add .to(device) in python quickstart

* docs: move section header out of comment

* chore: use device instead of hard-coded string

* docs: update inference.md
2023-08-28 11:19:00 +02:00
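The new device API in a nutshell (model name is an example):
```
import torch
from TTS.api import TTS

device = "cuda" if torch.cuda.is_available() else "cpu"

# Models are placed on a device with a standard .to(device) call instead of a gpu flag
tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(device)
tts.tts_to_file(text="Device placement is now explicit.", file_path="out.wav")
```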
Eren Gölge c0b5e61749
Bump up to v0.16.5 2023-08-26 12:00:25 +02:00
Eren Gölge a7a96d08dd
Fix loading Bark (#2893)
* Fixup hubert path

* Make style
2023-08-26 11:59:00 +02:00
Eren Gölge 04a36a727b
Bump up to v0.16.4 2023-08-26 10:39:48 +02:00
Eren Gölge a96562a750
Update .models.json 2023-08-26 10:36:40 +02:00
Jake Tae 409db505d2
Add device support in TTS and Synthesizer (#2855)
* fix: resolve merge conflicts

* fix: retain backwards compatibility in functions

* feature: utilize device for voice transfer

* feature: use device for vocoder

* chore: cleanup vocoder cpu logic

* fix: add necessary vocoder output device check

* fix: add necessary vocoder output device check

* fix: indentation

* fix: check if waveform is pt tensor before cpu conversion

---------

Co-authored-by: Jake Tae <jaketae@Jakes-MacBook-Pro-2.local>
2023-08-14 21:04:44 +02:00
Julian Weber febcaf710a
Add customizable data home path (#2871)
* Add customizable data home path

* Add TTS_HOME as an option
2023-08-14 21:02:48 +02:00
Eren Gölge c4e5effab9 Bump up to v0.16.3 2023-08-13 12:22:04 +02:00
Michael New 1f9d600b83
Denote human voices in README.md (#2851) 2023-08-13 12:15:17 +02:00
Eren Gölge 3a104d5c49
Update Studio API for XTTS (#2861)
* Update Studio API for XTTS

* Update the docs

* Update README.md

* Update README.md

Update README
2023-08-13 12:04:12 +02:00
Eren G??lge 37b558ccb9 Make style 2023-08-11 12:55:23 +02:00
Eren G??lge 9a8352b8da Fix import error with Bark 2023-08-11 03:33:59 +02:00
Eren Gölge c87377b713
Bump up to v0.16.2 2023-08-07 13:21:14 +02:00
Eren Gölge 4186f42b21
Handle missing JA phonemizer (#2843)
* Handle missing JA phonemizer

* Make style
2023-08-07 13:19:38 +02:00
Eren Gölge 48f8133eae
Fix imports (#2845) 2023-08-07 13:19:26 +02:00
Javier 4e7f8cd021
Add fairseq onnx support and strict configuration, fixes some onnx errors (#2831) 2023-08-04 11:02:59 +02:00
ChaseC 52a528cfcf
add post functionality to /api/tts (#2836) 2023-08-04 10:54:20 +02:00
Eren Gölge dc04baa1ee
Bump up to v0.16.1 2023-07-31 15:54:45 +02:00
Eren Gölge 17ddd65741 Please p3.11 2023-07-31 15:53:19 +02:00
Eren Gölge 69f080eb47
Fix DelightfulTTS (#2823)
* Fix tests

* Make style
2023-07-31 13:52:45 +02:00
Eren Gölge 483888b9d8
Add kwargs to ignore extra arguments w/o error (#2822) 2023-07-31 11:37:35 +02:00
AWAS666 9e74b51aa6
Delightful TTS VCTK recipe fixes (#2808)
* fix: wrong import class

* fix: formatter name missing

* feat: get rid of clearml
2023-07-31 10:27:42 +02:00
Aleś Bułojčyk d124f78430
Recipe for Belarusian TTS (#2756)
* Changes from jhlfrfufyfn <jhlfrfufyfn@gmail.com>

* Recipe for Belarusian TTS

---------

Co-authored-by: jhlfrfufyfn <jhlfrfufyfn@gmail.com>
2023-07-31 10:26:21 +02:00
Javier c140df5a58
Adds multi-language support for VITS onnx, fixes onnx inference error when speaker_id is None or not passed, fixes onnx exporting for models with init_discriminator=false (#2816) 2023-07-31 10:19:49 +02:00
Eren Gölge b739326503
Bump up to v0.16.0 2023-07-24 16:04:10 +02:00
Eren Gölge 8aacb81849
Fix Tortoise load (#2791)
* Remove key pruning in tortoise

* Make lint
2023-07-24 13:42:47 +02:00
Eren Gölge b3472a739e
Update README.md 2023-07-24 13:42:20 +02:00
logan hart 6fdb88f8e2
Add Delightful-TTS implementation (#2095)
* add configs

* Update config file

* Add model configs

* Add model layers

* Add layer files

* Add layer modules

* change config names

* Add emotion manager

* Fix missing ap bug

* Fix missing ap bug

* Add base TTS e2e class

* Fix wrong variable name in load_tts_samples

* Add training script

* Remove range predictor and gaussian upsampling

* Add helper function

* Add vctk recipe

* Add conformer docs

* Fix linting in conformer.py

* Add Docs

* remove duplicate import

* refactor args

* Fix bugs

* Remove emotion embedding

* remove unused arg

* Remove emotion embedding arg

* Remove emotion embedding arg

* fix style issues

* Fix bugs

* Fix bugs

* Add unittests

* make style

* fix formatter bug

* fix test

* Add pyworld compute pitch func

* Update requirements.txt

* Fix dataset Bug

* Change layer norm to instance norm

* Add missing import

* Remove emotions.py

* remove ssim loss

* Add init layers func to aligner

* refactor model layers

* remove audio_config arg

* Rename loss func

* Rename to delightful-tts

* Rename loss func

* Remove unused modules

* refactor imports

* replace audio config with audio processor

* Add change sample rate option

* remove broken resample func

* update recipe

* fix style, add config docs

* fix tests and multispeaker embd dim

* remove pyworld

* Make style and fix inference

* Split tts tests

* Fixup

* Fixup

* Fixup

* Add argument names

* Set "random" speaker in the model Tortoise/Bark

* Use a different f0_cache path for delightful tts

* Fix delightful speaker handling

* Fix lint

* Make style

---------

Co-authored-by: loganhart420 <loganartpersonal@gmail.com>
Co-authored-by: Eren Gölge <erogol@hotmail.com>
2023-07-24 13:41:26 +02:00
Eren Gölge f24c5e0276 Update README 2023-07-24 13:30:19 +02:00
Eren Gölge 1652598a33 Test synthesize api separately 2023-07-24 12:38:20 +02:00
Eren Gölge 0de12ec5aa
API tests (#2790)
* Separate API tests and only run when uplifted

* Make style
2023-07-24 12:14:21 +02:00
Paul O'Leary McCann c0aabb8596
Make Japanese-specific dependencies optional (#2776)
* Don't install MeCab by default

* Add optional [ja] deps, like [dev] etc

* Add JA requirements file

* Add JA requirements to requirements_all

This should help the tests run.
2023-07-24 11:28:27 +02:00
Aleś Bułojčyk e5fb0d9627
Fix share model page URL (#2757) 2023-07-09 12:19:49 +02:00
Eren Gölge 672ec3b35e
Fix #2749 (#2750) 2023-07-08 11:40:44 +02:00
Eren Gölge b5cd644132
Bump up to v0.15.6 2023-07-08 10:33:09 +02:00
Eren Gölge a2984fb435
Fix #2745 (#2748) 2023-07-07 20:23:27 +02:00
Eren Gölge 7b5c8422c8
Export multispeaker onnx (#2743) 2023-07-06 13:36:50 +02:00
Eren Gölge 08bc758cad
Merge pull request #2741 from coqui-ai/merge_2651
Resolve conflicts
2023-07-06 09:53:48 +02:00
JiangCheng 53938e2d32 Squashed commit of the following:
commit dd612fd72e
Author: JiangCheng <jiangcheng@kezaihui.com>
Date:   Mon Jun 5 16:04:54 2023 +0800

    Failed to download the file and need to delete the created file path
2023-07-05 12:08:05 +02:00
Eren Gölge e42a72eb79 Fix typo 2023-07-04 12:14:54 +02:00
Eren Gölge 229cfbdf8a Update README.md 2023-07-04 12:09:50 +02:00
Wouter van der Velde d611067d50
fixed small spelling mistakes (#2551) 2023-07-04 11:42:54 +02:00
ZhouGongZaiShi d5f16d77c2
delete meaningless print() (#2662) 2023-07-04 11:38:17 +02:00
PiaoYang 630327c4e6
Update compute_embeddings.py (#2668)
* [Typo] Fix variable name. More readable description.

Update train_yourtts.py

Reformat.

Reformat using black again.

* Add `old_append`. Fix bool argparse.

* Reformat.
2023-07-04 11:37:47 +02:00
ChaseC 8957799e45
fix loading of model and vocoder configs (#2698) 2023-07-04 11:32:00 +02:00
Eren Gölge 505ac1aa8f
Bump up to v0.15.5 2023-07-03 11:18:06 +02:00
Eren Gölge 453d04836b
Merge pull request #2733 from coqui-ai/update_docs
Update docs and credits
2023-07-03 11:17:15 +02:00
Fred f6eaa61afe Adding checkpoint model 2023-07-02 18:55:50 -03:00
Eren Gölge 9b041f958b Update docs and credits 2023-07-02 13:09:40 +02:00
Eren G??lge 21a3f280de Bump up to v0.15.4 2023-06-30 15:05:00 +02:00
Eren G??lge 90cf712bb4 Update docs 2023-06-30 14:58:15 +02:00
Eren Gölge 588fe21310 Update docs 2023-06-30 14:40:54 +02:00
Eren Gölge f9cde7bb1b Bump up to v0.15.3 2023-06-30 14:30:18 +02:00
Eren Gölge 472629d6fd Docs update 2023-06-30 14:30:00 +02:00
Eren G??lge 413a345d66 Bump up to v0.15.2 2023-06-30 14:16:47 +02:00
Eren G??lge 53d3bb07f4 Update docs python ver 2023-06-30 14:16:47 +02:00
Eren G??lge cb9c320691 Fixup 2023-06-30 14:13:11 +02:00
Eren G??lge dfd8d313a2 Bump up to v1.5.1 2023-06-29 17:53:09 +02:00
Eren G??lge 797eab2dd0 Add bark docs 2023-06-29 17:52:42 +02:00
Eren G??lge a035b25340 Bump up to v0.15.0 2023-06-28 15:24:20 +02:00
Eren G??lge b8c2ab9833 [ci skip] Update README 2023-06-28 12:30:54 +02:00
Eren G??lge 34b9a18c47 Fixup 2023-06-28 12:26:04 +02:00
Eren G??lge 91cc11d636 Remove commented codes 2023-06-28 12:14:37 +02:00
Eren G??lge 6b9ebf5aab Merge branch 'p3_11' into dev 2023-06-28 12:13:04 +02:00
Eren Gölge c844b6570a
Inference API for 🐶Bark (#2685)
* Add bark requirements

* Draft Bark implementation

* Download HF models

* Update synthesizer

* Add bark model

* Make style

* Update pylintrc

* Update model URLs

* Update Bark Config

* Fix here and there

* Make style

* Make lint

* Update requirements

* Update requirements
2023-06-28 11:55:27 +02:00
Eren G??lge 4786548287 Prevent running bark test on CI 2023-06-28 11:24:45 +02:00
Eren G??lge f6fa1dbc9f Fix sed 2023-06-27 15:02:49 +02:00
Eren G??lge 3933b47f33 Fixup 2023-06-27 00:08:06 +02:00
Eren G??lge d659dbe3c6 Remove fairseq 2023-06-26 19:31:56 +02:00
Eren G??lge a13b1352a4 Fixup 2023-06-26 19:30:26 +02:00
Eren G??lge 17ac188958 Drop fairseq for Hubert 2023-06-26 19:27:48 +02:00
Eren G??lge c03768bb53 Make style 2023-06-26 17:16:26 +02:00
Eren G??lge a1c431e6a9 Fixups 2023-06-26 12:55:18 +02:00
Eren G??lge 115baf7e47 Drop other p3.8 refs 2023-06-26 11:42:57 +02:00
Eren G??lge 1cce0e8bcb Drop p3.8 from CI 2023-06-26 11:40:58 +02:00
Eren G??lge 0cce2c0e89 Correct python_version 2023-06-22 14:07:35 +02:00
Eren G??lge 8c1d8df759 Disable linter until I've some peace of mind 2023-06-22 13:58:55 +02:00
Eren G??lge a58fb6c01b Update requirements 2023-06-22 13:53:19 +02:00
Eren G??lge ddbb27547a Update CI 2023-06-22 13:51:58 +02:00
Eren G??lge e888e8a56d Fix manage 2023-06-22 10:13:20 +02:00
Eren Gölge fff8b762bc
Merge branch 'dev' into bark 2023-06-21 15:49:05 +02:00
Eren Gölge 4cf8652392
Fix Tortoise load (#2697)
* Handle missing gpt weights

* Make style

* Fix lint
2023-06-21 15:42:01 +02:00
Eren G??lge 9190f1a5f3 Update requirements 2023-06-21 12:22:37 +02:00
Eren G??lge 8597ee13af Update requirements 2023-06-21 12:21:22 +02:00
Eren G??lge cf98ae04df Make lint 2023-06-21 12:05:08 +02:00
Eren G??lge 3b9fca2398 Make style 2023-06-21 12:02:06 +02:00
Eren G??lge 0f8932a6a9 Fix here and there 2023-06-21 11:59:27 +02:00
Eren G??lge 03c347b7f3 Update Bark Config 2023-06-21 11:58:18 +02:00
Eren G??lge 695e862aad Update model URLs 2023-06-21 11:57:46 +02:00
Eren G??lge e89aa97025 Update pylintrc 2023-06-21 11:57:33 +02:00
Eren G??lge f4c88ed677 Make style 2023-06-19 14:22:32 +02:00
Eren G??lge 37b708dac7 Add bark model 2023-06-19 14:16:06 +02:00
Eren G??lge 2364c38d16 Update synthesizer 2023-06-19 14:15:21 +02:00
Eren G??lge 5a31fad502 Download HF models 2023-06-19 14:14:04 +02:00
Eren G??lge f59da4dba5 Draft Bark implementation 2023-06-12 14:32:39 +02:00
Tsai Meng-Ting d65819422b
Update stochastic_duration_predictor.py (#2663)
fix a typo
2023-06-12 11:10:54 +02:00
Reuben Morais 2d967b786d
Fix typo in README [skip ci] 2023-06-08 09:47:10 +02:00
Eren Gölge 49cf6a5d62 Bump up to v0.14.3 2023-06-06 09:41:59 +02:00
Eren Gölge 8e415732dd Fixup 2023-06-06 09:41:46 +02:00
Eren Gölge 547a72c97d Fixup 2023-06-05 22:38:56 +02:00
Eren Gölge a494f0c92a Bump up to v0.14.1 2023-06-05 11:29:10 +02:00
Eren Gölge 50b1074779 Make `tts` ready 2023-06-05 11:29:10 +02:00
Eren Gölge e785d101a1
Port Fairseq TTS models (#2628)
* Load fairseq models

* Add docs and missing files

* Managing fairseq models and docs for API

* Make style

* Use scarf URL

* Add tests

* Fix URL

* Pass cpu

* Make lint

* Fixup

* Make lint

* fixup

* Fixup

* Change tokenization order

* Update README

* Fixup

* Fixup
2023-06-05 11:15:13 +02:00
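Usage as documented for the ported Fairseq (MMS) models, which are addressed by an ISO language code (the code and text below are examples):
```
from TTS.api import TTS

# Fairseq/MMS models follow the naming scheme tts_models/<iso-code>/fairseq/vits
tts = TTS("tts_models/deu/fairseq/vits")
tts.tts_to_file(text="Hallo, wie geht es dir?", file_path="fairseq_de.wav")
```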
Shukrullo Turgunov 0d5e68a09f
fix typo (#2647)
* fix typo

* typo fix
2023-06-05 09:58:16 +02:00
Eren G??lge deebc0cc16 Add bark requirements 2023-05-23 10:12:26 +02:00
Reuben Morais 23a7a9a363
Fetch all built-in speakers (#2626) 2023-05-22 17:28:08 +02:00
Eren Gölge aef7f6d980 Bump up to v0.14.1 2023-05-18 11:13:09 +02:00
Eren Gölge 9e99e0f42d Disable reduction 2023-05-18 11:12:51 +02:00
Eren Gölge bc0a532c7a
Bump up to v0.14.0 2023-05-16 10:08:41 +02:00
Eren Gölge 7d6e18629f Add Tortoise to README 2023-05-16 01:33:56 +02:00
Eren Gölge 01aa0034b2 Fixup docs 2023-05-16 01:24:25 +02:00
Eren Gölge 4de797bb11
Draft ONNX export for VITS (#2563)
* Draft ONNX export for VITS

Could not get it to output variable-length sequences

* Fixup for onnx constant output

* Make style

* Remove commented code
2023-05-16 01:07:56 +02:00
Eren Gölge 16c9df0dfe
Fix API CI (#2616)
* Fix indentation

* Fixup

* Make style
2023-05-16 01:05:35 +02:00
manmay nakhashi a3d5801c44
Tortoise TTS inference (#2547)
* initial commit

* Tortoise inference

* revert path change

* style fix

* remove accidental remove

* style fixes

* style fixes

* removed unwanted assets and deps

* remove changes

* remove cvvp

* style fix black

* added tortoise config and updated config and args, refactoring the code

* added tortoise to api

* Pull mel_norm from url

* Use TTS cleaners

* Let download model files

* add ability to pass tortoise presets through coqui api

* fix tests

* fix style and tests

* fix tts commandline for tortoise

* Add config.json to tortoise

* Use kwargs

* Use regular model api for loading tortoise

* Add load from dir to synthesizer

* Fix Tortoise floats

* Use model_dir when there are multiple urls

* Use `synthesize` when exists

* lint fixes and resolve preset bug

* resolve a download bug and update model link

* fix json

* do tortoise inference from voice dir

* fix

* fix test

* fix speaker id and remove assets

* update inference_tests.yml

* replace inference_test.yml

* fix extra dir as None

* fix tests

* remove space

* Reformat docstring

* Add docs

* Update docs

* lint fixes

---------

Co-authored-by: Eren Gölge <egolge@coqui.ai>
Co-authored-by: Eren Gölge <erogol@hotmail.com>
2023-05-16 00:58:21 +02:00
Eren Gölge 0b6b957e76 Drop API tests when token is not available.
This was a necessary test, but for my sanity
I just drop it until I find the reason why
the secret is not recognized in PR CI tests.
2023-05-11 18:13:37 +02:00
Eren Gölge 9b5822d625
Update VAD for silence trimming. (#2604)
* Update vad for mp3 and fault tolerance

* Make style

* Remove import

* Remove stupid defaults
2023-05-11 11:09:23 +02:00
Eren Gölge 5c89c621ca
Warn when lang is not avail (#2460)
* Warn when lang is not avail

* Make style

* Make lint
2023-05-11 11:08:43 +02:00
Eren Gölge dfb51e06b2
Add jenny model (#2603) 2023-05-08 12:05:40 +02:00
Atharva Shah ba40a1c5c6
Update README.md (#2577)
Update link to point to blob instead of edit
2023-05-08 11:18:55 +02:00
Michael Görner 27e237ed08
use default_factory for audio parameter (#2576)
Python 3.11 complains about the mutable default and other members
were already adapted to use the factory, so I expect this line just
went unnoticed until now.
2023-05-08 11:17:36 +02:00
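The pattern in question, sketched with a stand-in config class (the real field is the audio config of the affected dataclass):
```
from dataclasses import dataclass, field

@dataclass
class StandInConfig:
    # A mutable default like `audio: dict = {}` is rejected by dataclasses (Python 3.11
    # tightened this check to any unhashable default); default_factory builds a fresh
    # value per instance instead.
    audio: dict = field(default_factory=dict)
    name: str = "example"

print(StandInConfig().audio)  # {} -- each instance gets its own dict
```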
Julian Weber eec6beb966
rm pip cache (#2600) 2023-05-08 10:38:21 +02:00
Edresson Casanova 51a3d45025
Add FR and ES gruut languages as requirement to avoid inference issues (#2572)
* Add all gruut-supported languages as a requirement to avoid inference issues

* Remove unused gruut languages
2023-05-03 09:49:01 -03:00
prakharpbuf c1875f68df
typos and minor fixes (#2508)
* Update tacotron1-2.md

* Update README.md

* Update Tutorial_2_train_your_first_TTS_model.ipynb

* Update synthesizer.py

There is no arg called --speaker_name

* Update formatting_your_dataset.md

* Update AnalyzeDataset.ipynb

* Update AnalyzeDataset.ipynb

* Update AnalyzeDataset.ipynb

* Update finetuning.md

* Update train_yourtts.py

* Update train_yourtts.py

* Update train_yourtts.py

* Update finetuning.md
2023-04-26 15:22:57 +02:00
Eren Gölge 2071088bab
Bump up to v0.13.3 2023-04-17 16:13:35 +02:00
Eren Gölge b2bc2ac797
Merge pull request #2532 from coqui-ai/bangla_model
Bangla models
2023-04-17 16:13:00 +02:00
Eren Gölge 1a6a5710fd Make lint 2023-04-17 15:02:56 +02:00
Eren Gölge a44a0e1fd2 Update model urls 2023-04-17 14:53:27 +02:00
Eren Gölge d3e215f8bd Add link 2023-04-17 13:48:55 +02:00
Eren Gölge 2533a18d62 Add BN tests 2023-04-17 13:37:10 +02:00
Eren Gölge 2d49c05259 Remove import 2023-04-17 13:05:29 +02:00
Eren Gölge 5e5768d784 Fix API 2023-04-17 13:05:19 +02:00
Eren Gölge bce819a624 Add docs for adding a new lang frontend 2023-04-17 12:54:35 +02:00
Eren Gölge 6505553da5 Add BN requirements 2023-04-17 12:54:14 +02:00
Eren Gölge cd83991067 Add BN phonemizer 2023-04-17 12:54:00 +02:00
Eren Gölge 36be05290d Add models 2023-04-17 12:52:32 +02:00
Eren Gölge e4c5c27854
Bump up to v0.13.2 2023-04-14 10:23:39 +02:00
Eren Gölge dba5cec497
Merge pull request #2509 from coqui-ai/update_vad
Update VAD
2023-04-13 19:35:17 +02:00
Eren Gölge e07c6f54fd
Merge pull request #2515 from coqui-ai/tts_cmd
🐸Studio models by `tts`
2023-04-13 19:34:28 +02:00
Eren Gölge 5a9bda13f3 Make style 2023-04-13 14:19:06 +02:00
Eren Gölge c9375e4b8b Make style 2023-04-13 14:17:06 +02:00
Eren Gölge 758ef84cc2 Using 🐸Studio models with `tts` command 2023-04-13 14:14:41 +02:00
Eren G??lge 537dc0e933 Update VAD 2023-04-13 00:39:46 +02:00
Eren Gölge e33e7170ed Bump up to v0.13.1 2023-04-12 16:20:53 +02:00
Eren Gölge 8da3342676 Ping API 2023-04-12 16:20:53 +02:00
Eren Gölge 73d963718a
Merge pull request #2495 from coqui-ai/api_voice_conversion
Api voice conversion
2023-04-11 16:40:14 +02:00
Eren Gölge cbb592b295 Fixup 2023-04-10 14:50:11 +02:00
Eren Gölge b8b9f09de5 Fixup 2023-04-10 14:06:31 +02:00
Eren Gölge 76511972e9 Add freevc to the models list 2023-04-10 14:03:08 +02:00
Eren Gölge 209f0a509a Add voice conversion api test 2023-04-10 13:37:47 +02:00
Eren Gölge a49c1931d9 Fixup 2023-04-10 13:33:42 +02:00
Eren Gölge 5bd1fb6b2c Fix API for voice conversion 2023-04-10 13:32:16 +02:00
Eren Gölge 30109af2a0
Merge pull request #2480 from MattyB95/librosa_v0.10.0
Update Librosa Version To V0.10.0
2023-04-07 12:32:33 +02:00
Matthew Boakes 5bdd6f7c18 Updated Librosa Dependency Specification 2023-04-06 12:36:24 +01:00
Eren Gölge 1233365cf4 Bump up to v0.13.0 2023-04-05 15:09:31 +02:00
Eren Gölge ad8b9bf2be
🐸 Coqui Studio API integration (#2484)
* Warn when lang is not avail

* Make style

* Implement Coqui Studio API

* Test

* Update docs

* Set action

* Make style

* Make lint

* Update README

* Make style

* Fix action

* Run actions
2023-04-05 15:06:50 +02:00
Wesley Pyburn ce79160576
Fix errors in README.md (#2478)
Running the sample code below results in an error `language_id = self.tts_model.language_manager.name_to_id[language_name]`.
The fix is to run the code with the correct language strings; the README has been updated accordingly in this PR.

I assume this small typo leads to #2456 and #2458
2023-04-05 12:23:07 +02:00
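A defensive pattern that avoids the lookup error: query the model for its exact language and speaker strings before synthesizing (model name is an example):
```
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/your_tts")
print(tts.languages)  # the exact strings accepted by the language argument
print(tts.speakers)

tts.tts_to_file(text="Hello everyone.", language=tts.languages[0],
                speaker=tts.speakers[0], file_path="out.wav")
```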
Matthew Boakes 4c829e74a1 Update Librosa Version To V0.10.0 2023-04-05 00:59:20 +01:00
Yingzhi WANG 95fa2c9fd6
fix typo (#2475) 2023-04-03 23:31:09 +02:00
p0p 91cf1b2da9
[minor] batch["speaker_ids"] getting set two times (#2470)
* [minor] batch["speaker_ids"] getting set two times

just to make it consistent with language_ids

* Update vits.py

style.
2023-04-03 11:35:21 +02:00
Rajiv P c2d15cd413
[minor] hifigan_generator.py typo (#2462)
resblock2 description updated.
2023-03-28 12:43:36 +02:00
Eren Gölge d309f50e53
Implement FreeVC (#2451)
* Update .gitignore

* Draft FreeVC implementation

* Tests and relevant updates

* Update API tests

* Add missings

* Update requirements

* :(

* Lazy handle for vc

* Update docs for voice conversion

* Make style
2023-03-25 18:33:23 +01:00
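How this surfaces in the high-level API added alongside this work (model name as listed in the released voice-conversion models):
```
from TTS.api import TTS

# FreeVC converts the speech in source_wav into the voice of target_wav
vc = TTS("voice_conversion_models/multilingual/vctk/freevc24")
vc.voice_conversion_to_file(source_wav="my_recording.wav",
                            target_wav="target_speaker.wav",
                            file_path="converted.wav")
```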
Eren Gölge 090cadf270
Update numba version (#2435) 2023-03-21 11:40:42 +01:00
Khalid Bashir 14c80dd1fd
vits.py training fixed due to return_complex (#2418)
Torch now requires `return_complex` to be set for the `torch.stft` method.
This turned the warning into an error:
```
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1591, in fit
    self._fit()
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1544, in _fit
    self.train_epoch()
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1309, in train_epoch
    _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time)
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1162, in train_step
    outputs, loss_dict_new, step_time = self._optimize(
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 1023, in _optimize
    outputs, loss_dict = self._model_train_step(batch, model, criterion, optimizer_idx=optimizer_idx)
  File "/usr/local/lib/python3.10/dist-packages/trainer/trainer.py", line 970, in _model_train_step
    return model.train_step(*input_args)
  File "/workspace/coqui-tts/TTS/tts/models/vits.py", line 1293, in train_step
    mel_slice_hat = wav_to_mel(
  File "/workspace/coqui-tts/TTS/tts/models/vits.py", line 191, in wav_to_mel
    spec = torch.stft(
  File "/usr/local/lib/python3.10/dist-packages/torch/functional.py", line 641, in stft
    return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
RuntimeError: stft requires the return_complex parameter be given for real inputs, and will further require that return_complex=True in a future PyTorch release.
```
2023-03-19 00:22:04 +01:00
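A minimal sketch of the pattern behind the fix, not the exact `wav_to_mel` code; the STFT parameters are illustrative:

```python
import torch

x = torch.randn(1, 22050)          # real-valued audio batch
window = torch.hann_window(1024)

# Newer PyTorch requires return_complex to be given explicitly for real inputs.
spec = torch.stft(
    x, n_fft=1024, hop_length=256, win_length=1024,
    window=window, center=True, return_complex=True,
)
magnitude = spec.abs()             # or torch.view_as_real(spec) for the old real/imag layout
```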
Eren Gölge 12f3365185 Merge branch 'dev' 2023-03-17 13:31:08 +01:00
Eren Gölge 2db262747e
Bump up to v0.12.0 2023-03-17 13:21:03 +01:00
Roee Shenberg 3c15f0619a
Bug fixes in OverFlow audio generation (#2380) 2023-03-15 12:02:11 +01:00
Eren Gölge b8d9837d27
Merge pull request #2407 from dveni/patch-1
Update vits.py
2023-03-14 10:28:23 +01:00
Daniel Vera Nieto dfb48737fb Style fixed 2023-03-13 16:11:15 +01:00
Eren Gölge 9bb62c570d
Merge pull request #2390 from coqui-ai/dev
v0.12.0
2023-03-13 12:43:38 +01:00
Eren Gölge c10f9a3699
Update docs (#2389)
* Update docs index

* Add Mary-TTS docs

* Update docs index

* Add Overflow docs
2023-03-13 12:42:20 +01:00
Eren Gölge 4ca07514d4
Remove doc bot (#2399) 2023-03-13 12:42:01 +01:00
Dani Vera 0d12229b64
Update vits.py
This should fix the issue https://github.com/coqui-ai/TTS/issues/1986 without breaking batch data sampling.
2023-03-10 18:35:16 +01:00
manmay nakhashi 624513018d
add energy by default to Fastspeech2 config (#2326)
* add energy by default

* added energy to base tts

* fix energy dataset

* fix styles

* fix test
2023-03-06 10:20:25 +01:00
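A hedged sketch of what enabling the energy predictor could look like; the config class path and the `use_energy` field name are assumptions inferred from the commit messages, not verified against the release:

```python
# Assumed names: Fastspeech2Config and model_args.use_energy follow the commits above;
# check TTS/tts/configs/fastspeech2_config.py for the actual fields and defaults.
from TTS.tts.configs.fastspeech2_config import Fastspeech2Config

config = Fastspeech2Config()
config.model_args.use_energy = True   # energy predictor enabled, per "add energy by default"
print(config.model_args.use_energy)
```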
Florian Quirin 478c8178b8
Basic Mary-TTS API compatibility (#2352)
* added basic Mary-TTS API endpoints to server

- imported `parse_qs` from `urllib.parse` to parse HTTP POST parameters
- imported `render_template_string` from `flask` to return text as endpoint result
- added new routes:
  - `/locales` - returns list of locales (currently locale of active model)
  - `/voices` - returns list of voices (currently locale and name of active model)
  - `/process` - accepts synth. request (GET and POST) with parameter `INPUT_TEXT` (other parameters ignored since we have only one active model)

* better log messages for Mary-TTS API

- smaller tweaks to log output

* use f-string in log print to please linter

* updated server.py to match 'make style' result
2023-03-06 10:08:21 +01:00
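A self-contained sketch of routes shaped like the ones described above, using the same helpers (`parse_qs`, `render_template_string`); the `synthesize_to_wav_bytes` helper is hypothetical and stands in for the server's synthesizer:

```python
import io
from urllib.parse import parse_qs

from flask import Flask, request, render_template_string, send_file

app = Flask(__name__)

def synthesize_to_wav_bytes(text: str) -> bytes:
    # Hypothetical stand-in for the actual TTS synthesizer call.
    return b"RIFF...placeholder..."

@app.route("/locales", methods=["GET"])
def mary_locales():
    # Only the locale of the active model is reported.
    return render_template_string("{{ locale }}\n", locale="en_US")

@app.route("/voices", methods=["GET"])
def mary_voices():
    # "<name> <locale>" per line, Mary-TTS style.
    return render_template_string("{{ name }} {{ locale }}\n", name="active-model", locale="en_US")

@app.route("/process", methods=["GET", "POST"])
def mary_process():
    if request.method == "POST":
        data = parse_qs(request.get_data(as_text=True))
        text = data.get("INPUT_TEXT", [""])[0]
    else:
        text = request.args.get("INPUT_TEXT", "")
    return send_file(io.BytesIO(synthesize_to_wav_bytes(text)), mimetype="audio/wav")
```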
thennal10 d39bc74f57
OverFlow with test sentences (#2253)
* Fix typo in function definition

* Swap hasattr out

hasattr(self, "speaker_manager") and hasattr(self, "language_manager") seem redundant since BaseTTS defines both.
2023-03-01 09:11:30 +01:00
Edresson Casanova 16b9862252
Fix Speaker Consistency Loss (SCL) (#2364) 2023-02-27 09:14:00 +03:00
p0p4k a365a7e888
numpy version for py310 (#2316)
* numpy version for py310

requested in #2315

* Update requirements.txt
2023-02-13 10:34:00 +01:00
Eren Gölge d488b4f1c6 Merge branch 'dev' into main 2023-02-10 17:39:37 +01:00
Eren Gölge 661725b95e Bump up to v0.11.1 2023-02-10 15:59:05 +01:00
Eren Gölge 0196b4dfbf Merge branch 'add_neural_hmm_model' into dev 2023-02-10 15:23:56 +01:00
Eren Gölge ea5bd7dcbc Merge branch 'dev' into main 2023-02-10 10:27:34 +01:00
Eren Gölge 914280a556
Bump up to v0.11.0 (#2329)
* Make style

* Bump up to v0.11.0
2023-02-08 13:58:49 +01:00
Eren Gölge 6cfb590eb2 Merge branch 'dev' into main 2023-02-06 11:47:18 +01:00
Eren Gölge 683b4d432f Fixup 2023-02-06 11:44:56 +01:00
Eren Gölge c7184dcef9 Linter fix 2023-02-06 11:30:36 +01:00
Eren Gölge 910a218652 Merge branch 'dev' into main 2023-02-06 11:25:33 +01:00
Eren Gölge 4e75b6262c Update docs 2023-02-06 11:20:32 +01:00
Eren Gölge 85b3a04b37 Merge branch 'api_model_path' into dev 2023-02-06 11:18:00 +01:00
Eren Gölge c496b1a986 Linter fix 2023-02-06 11:17:28 +01:00
Eren Gölge baed2a2c2b Update README 2023-02-06 11:15:43 +01:00
marius851000 1f4d8bf0f1
Fix tts-server for multi-lingual models (#2257) 2023-02-06 10:54:34 +01:00
Eren Gölge 6ee94f8bad Fixup 2023-01-30 14:02:25 +01:00
Eren Gölge 713e8c8d04 Add pretrained model 2023-01-30 13:55:17 +01:00
Eren Gölge 7fddabc8ac Implement cloning in API 2023-01-30 13:35:48 +01:00
Eren Gölge 335b8ed44e Add vocoder path 2023-01-30 12:59:29 +01:00
Martin Weinelt 994be163e1
Use packaging.version for version comparisons (#2310)
* Use packaging.version for version comparisons

The distutils package is deprecated¹ and relies on PEP 386² version
comparisons, which have been superseded by PEP 440³ which is implemented
through the packaging module.

With more recent distutils versions, provided through setuptools
vendoring, we are seeing the following exception during version
comparisons:

> TypeError: '<' not supported between instances of 'str' and 'int'

This is fixed by this migration.

[1] https://docs.python.org/3/library/distutils.html
[2] https://peps.python.org/pep-0386/
[3] https://peps.python.org/pep-0440/

* Improve espeak version detection robustness

On many modern systems espeak is just a symlink to espeak-ng. In that
case looking for the 3rd word in the version output will break the
version comparison, when it finds `text-to-speech:`, instead of a proper
version.

This will not break during runtime, where espeak-ng would be
prioritized, but the phonemizer and tokenizer tests force the backend
to `espeak`, which exhibits this breakage.

This improves the version detection by simply looking for the version
after the "text-to-speech:" token.

* Replace distuils.copy_tree with shutil.copytree

The distutils module is deprecated and slated for removal in Python
3.12. Its usage should be replaced, in this case by a compatible method
from shutil.
2023-01-29 23:47:00 +01:00
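A short sketch of both changes described above; the version strings are illustrative:

```python
from packaging.version import Version

# PEP 440 comparison instead of the deprecated distutils version classes.
assert Version("1.48.15") < Version("1.50")

def espeak_version_from_output(output: str) -> Version:
    # Works for both "eSpeak text-to-speech: 1.48.15 ..." and
    # "eSpeak NG text-to-speech: 1.51 ...": take the word right after the token.
    words = output.split()
    return Version(words[words.index("text-to-speech:") + 1])

print(espeak_version_from_output("eSpeak NG text-to-speech: 1.51  Data at: /usr/share/espeak-ng-data"))
```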
Eren Gölge cf076345e7 Make style 2023-01-23 13:49:51 +01:00
Eren Gölge 13334d507c Load model from path 2023-01-23 13:45:45 +01:00
Gerard Sant Muniesa c59b3f75b8
Add Catalan text cleaners for Catalan support (#2295) 2023-01-23 11:56:30 +01:00
Shivam Mehta d83ee8fe45
Adding neural HMM TTS Model (#2272)
* Adding neural HMM TTS

* Adding tests

* Adding neural hmm on readme

* renaming training recipe

* Removing overflow's decoder parameters from the config

* Update the Trainer requirement version for a compatible one (#2276)

* Bump up to v0.10.2

* Adding neural HMM TTS

* Adding tests

* Adding neural hmm on readme

* renaming training recipe

* Removing overflow's decoder parameters from the config

* fixing documentation

Co-authored-by: Edresson Casanova <edresson1@gmail.com>
Co-authored-by: Eren Gölge <erogol@hotmail.com>
2023-01-23 11:53:04 +01:00
Eren Gölge 497f22b20b
Cache speaker encoder model (#2284) 2023-01-23 11:49:51 +01:00
Eren Gölge 6e3f74fc29 Fix #2191 2023-01-15 23:11:57 +01:00
manmay nakhashi bc422f2f3c
Fastspeech2 (#2073)
* added EnergyDataset

* add energy to Dataset

* add compute_energy

* added energy params

* added energy to forward_tts

* added plot_avg_energy for visualisation

* Update forward_tts.py

* create file

* added fastspeech2 recipe

* add fastspeech2 config

* removed energy from fast pitch

* add energy loss to forward tts

* Update fastspeech2_config.py

* change run_name

* Update numpy_transforms.py

* fix typo

* fix typo

* fix typo

* linting issues

* use_energy default value --> False

* Update numpy_transforms.py

* linting fixes

* fix typo

* linting fix

* linting fix

* fix

* fixes

* fixes

* lint fix

* lint fixws

* added training test

* wrong import

* wrong import

* trailing whitespace

* style fix

* changed class name because of error

* class name change

* class name change

* change class name

* fixed styles
2023-01-15 22:39:22 +01:00
Eren Gölge 14d45b5347
Bump up to v0.10.2 2023-01-11 01:06:02 +01:00
Edresson Casanova 49dfaa5234
Update the Trainer requirement version for a compatible one (#2276) 2023-01-11 01:01:46 +01:00
Khalid Bashir 42afad5e79
Fixed bug related to yourtts speaker embeddings issue (#2234)
* Fixed bug related to yourtts speaker embeddings issue

* Reverted code for base_tts

* Bug fix on VITS d_vector_file type

* Ignore the test speakers on YourTTS recipe

* Add speaker encoder model and config on YourTTS recipe to easily do zero-shot inference

* Update YourTTS config file

* Update ModelManager._update_path to deal with list attributes

* Fix lint checks

* Remove unused code

* Fix unit tests

* Reset name_to_id to get the right speaker ids on load_embeddings_from_list_of_files

* Set weighted_sampler_multipliers as an empty dict to prevent users' mistakes

Co-authored-by: Edresson Casanova <edresson1@gmail.com>
2023-01-02 14:20:02 +01:00
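A rough sketch of the list-aware path update mentioned above ("Update ModelManager._update_path to deal with list attributes"); the helper name is illustrative, not the actual method:

```python
def set_path_attr(config, attr_name, new_value):
    """Update a config path attribute that may hold a single path or a list of paths."""
    current = getattr(config, attr_name, None)
    if isinstance(current, list):
        # e.g. a d_vector_file attribute that holds several embedding files
        setattr(config, attr_name, new_value if isinstance(new_value, list) else [new_value])
    else:
        setattr(config, attr_name, new_value)
```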
Eren Gölge da93d768b8 Update docs 2023-01-02 10:07:03 +01:00
Julian Weber a07397733b
Multilingual tokenizer (#2229)
* Implement multilingual tokenizer

* Add multi_phonemizer recipe

* Fix lint

* Add TestMultiPhonemizer

* Fix lint

* make style
2023-01-02 10:03:19 +01:00
Eren Gölge a31af762e8
v0.10.1 (#2242)
* Add Ukrainian LADA (female) voice

* Add ca and fa models

* Add pth files to manager

* Bump up to v0.10.1

Co-authored-by: Yehor Smoliakov <yehors@ukr.net>
2022-12-26 15:46:21 +01:00
Eren Gölge f814d52394 Bump up to v0.10.1 2022-12-26 14:29:46 +01:00
Eren Gölge 8c32a6998a Add pth files to manager 2022-12-26 14:29:25 +01:00
Eren Gölge cf765cb3f2 Add ca and fa models 2022-12-26 14:29:10 +01:00
Eren Gölge 0910cb76bc
Merge pull request #2226 from egorsmkv/patch-1
Add Ukrainian LADA (female) voice
2022-12-16 13:17:16 +01:00
Yehor Smoliakov 046b137946
Add Ukrainian LADA (female) voice 2022-12-16 12:30:44 +02:00
Eren Gölge a04db8d632
Merge pull request #2205 from coqui-ai/dev
🚀 v0.10.0
2022-12-15 12:02:16 +01:00
Eren Gölge 46b0ad37e7 Bump up to v0.10.0 2022-12-15 11:19:23 +01:00
Eren Gölge a9167cf239
Fixup overflow (#2218)
* Update overflow config

* Pulling shuffle and drop_last  from config

* Print training stats for overflow
2022-12-15 00:56:48 +01:00
Eren Gölge ecea43ec81
Adding pre-trained Overflow model (#2211)
* Adding pretrained Overflow model

* Stabilize HMM

* Fixup model manager

* Return `audio_unique_name` by default

* Distribute max split size over datasets

* Fixup eval_split_size

* Make style
2022-12-14 16:55:48 +01:00
Edresson Casanova 061ac43187
Add Original YourTTS vocabulary for full transfer learning (#2206) 2022-12-13 09:02:10 +01:00
Edresson Casanova 3b1a28fa95
Add YourTTS VCTK recipe (#2198)
* Add YourTTS VCTK recipe

* Fix lint

* Add compute_embeddings and resample_files functions to be able to reuse them

* Add automatic download and speaker embedding computation for YourTTS VCTK recipe

* Add parameter for eval metadata file on compute embeddings function
2022-12-12 16:14:25 +01:00
Shivam Mehta 3b8b105b0d
Adding OverFlow (#2183)
* Adding encoder

* currently modifying hmm

* Adding hmm

* Adding overflow

* Adding overflow setting up flat start

* Removing runs

* adding normalization parameters

* Fixing models on same device

* Training overflow and plotting evaluations

* Adding inference

* At the end of epoch the test sentences are coming on cpu instead of gpu

* Adding figures from model during training to monitor

* reverting tacotron2 training recipe

* fixing inference on gpu for test sentences on config

* moving helpers and texts within overflows source code

* renaming to overflow

* moving loss to the model file

* Fixing the rename

* Model training but not plotting the test config sentences' audio

* Formatting logs

* Changing model name to camelcase

* Fixing test log

* Fixing plotting bug

* Adding some tests

* Adding more tests to overflow

* Adding all tests for overflow

* making changes to camel case in config

* Adding information about parameters and docstring

* removing compute_mel_statistics; moved statistic computation to the model instead

* Added overflow in readme

* Adding more test cases; transition_p is no longer saved as a tensor and can be dumped as JSON
2022-12-12 12:44:15 +01:00
p0p4k 2e153d54a8
Adding missing key to formatter (#2194)
quick fix for #2156.
 added 'root_path' key.
2022-12-12 12:25:37 +01:00
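For reference, a sketch of the sample dict a formatter returns after the fix, with the previously missing `root_path` key; the values and the other field names are illustrative:

```python
sample = {
    "text": "Hello world.",
    "audio_file": "/data/LJSpeech-1.1/wavs/LJ001-0001.wav",
    "speaker_name": "ljspeech",
    "root_path": "/data/LJSpeech-1.1",   # the key added by this fix
}
```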
Eren Gölge 0c9fa2229b
Update README (#2204) 2022-12-12 12:20:50 +01:00
Eren Gölge 1ddc484b49
Python API implementation (#2195)
* Draft implementation

* Fix style

* Add api tests

* Fix lint

* Update docs

* Update tests

* Set env

* Fixup

* Fixup

* Fix lint

* Revert
2022-12-12 12:04:20 +01:00
Eren Gölge fdeefcc612
Handle espeak 1.48.15 (#2203) 2022-12-12 11:23:45 +01:00
Eren Gölge 24620743ca
Merge pull request #2187 from coqui-ai/dev-fix-vc 2022-12-06 21:27:34 +01:00
Eren Gölge c753ad49cc
Merge pull request #2189 from coqui-ai/fix-capacitron-test 2022-12-06 21:25:06 +01:00
WeberJulian 4787a2a993 Fix capacitron test when cuda is enabled 2022-12-06 18:07:48 +01:00
Edresson Casanova d2460de94b Fix unit tests 2022-12-05 09:59:11 -03:00
Edresson Casanova ee20e30958 Fix VITS multi-speaker voice conversion inference 2022-12-05 09:15:01 -03:00
Eren Gölge 9321b22203
Fix scheduler order 2022-12-05 12:26:15 +01:00
Eren Gölge c50d89fcf7
Merge pull request #2161 from coqui-ai/tutorials
fixed tutorial 2 incompatibility with new dev
2022-11-21 19:08:39 +01:00
Aya be0ba934ee cleared output 2022-11-21 14:50:07 +00:00
Aya 372605180e fixed tutorial 2 incompatibility with new dev 2022-11-21 14:44:40 +00:00
Eren Gölge 56ba616a03
Merge pull request #1942 from coqui-ai/dev
v0.9.0
2022-11-16 16:50:57 +01:00
Eren Gölge bc6120c330 [ci skip]Bump up to v0.9.0 2022-11-16 16:45:02 +01:00
Julian Weber 84b9b0879e
Fix documentation (#2154) 2022-11-16 16:13:07 +01:00
logan hart ff9b63d02a
Add neon models (#2140)
* Add neon ljspeech vits model

* Add neon german model

* Update .models.json

* Add neon spanish model

* Add french model

* Add Dutch model

* Add Hungarian model

* Add Greek model

* Remove unneeded description

* Update .models.json

* Update .models.json

* Handling neon models

* Add all neon models

* Update .models.json

* Split zoo_tests

* Update test names

* Update model testing

Co-authored-by: Eren Gölge <erogol@hotmail.com>
2022-11-16 16:12:39 +01:00
Eren Gölge a0f31df481
Fix README.md 2022-11-16 12:27:58 +01:00
Julian Weber 3191c5f1fe
Doc update docker (#2153)
* Complete Dockerignore to keep context manageable

* Add documentation on readme

* Match pip and docker cuda version

* Use pip3 consistently
2022-11-16 00:21:56 +01:00
Eren Gölge 4114136717
Add docker docs 2022-11-15 23:17:30 +01:00
Julian Weber f85609f9bf
Make docker images lighter (#2149) 2022-11-15 00:11:32 +01:00
Eren Gölge 7689fadd86
Remove gitter link 2022-11-14 10:44:17 +01:00
Ikko Ashimine 42edcad45f
Update README.md (#2146)
Github -> GitHub
2022-11-14 10:41:27 +01:00
Eren Gölge 38c99f2507
Update dep caching in actions (#2138) 2022-11-09 22:15:11 +01:00
Eren Gölge 8cb1433e6e
Cache fsspec downloads (#2132)
* Cache fsspec downloaded files

* Use diff paths for test

* Make fsspec caching optional

* Decom GPU docker tests

* Make progress bar optional for better CI log

* Check path local
2022-11-09 22:12:48 +01:00
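A small sketch of local caching for an fsspec download, illustrating the caching idea rather than the ModelManager code; the URL and cache directory are illustrative:

```python
import fsspec

# Chain the "filecache" protocol in front of the remote URL so repeated
# downloads are served from a local cache directory.
with fsspec.open(
    "filecache::https://example.com/models/model.zip",
    filecache={"cache_storage": "/tmp/tts_fsspec_cache"},
) as f:
    payload = f.read()
print(len(payload))
```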
Eren Gölge c5412532ac
Remove langs expect en and de (#2135) 2022-11-09 11:58:34 +01:00
Eren Gölge c16804f5d0
Add Discord server badge (#2136) 2022-11-09 11:27:07 +01:00
Eren Gölge b686c09704 Fix #2062 2022-11-07 09:22:43 +01:00
freezerain fcbfca869f
Fix back/forward slash in file path in mailabs formatter (#1938)
* mailabs formatter: back/forward slash in file path fix

* formatters.mailabs() path rework for Windows os

* new formatter added "mailabs_win"

* lint test fix commit

* mailabs_win: removed, mailabs: "/" replaced with os.sep for windows compatibility

* Black small style fix
2022-11-01 12:54:40 +01:00
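The gist of the fix is a one-line path normalization; a sketch with an illustrative M-AILABS relative path:

```python
import os

rel_path = "en_US/by_book/female/judy_bieber/dorothy_and_wizard_oz/wavs/sample.wav"
wav_file = os.path.join("/data/m-ailabs", rel_path.replace("/", os.sep))
print(wav_file)
```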
Victor Shepardson 5307a2229b
Fix Capacitron training (#2086) 2022-11-01 12:52:06 +01:00
Ahmed Husain 5ccef6e665
Use "formatter" key in the datasets json array (#2114)
Fix tutorial docs
2022-11-01 12:51:16 +01:00
Marek Šuppa fa0e71d0b6
Update forward_tts.md (#2019)
A small typo update in `forward_tts.md`
2022-10-29 12:52:24 +02:00
CeadeS 0207071f62
Update Tutorial_2_train_your_first_TTS_model.ipynb (#2079)
Fixes an inconsistent metadata file name in the metadata format example.
2022-10-25 18:36:39 +02:00
Eren Gölge ef82adcc51
Adding announcements to README.md 2022-10-15 19:11:16 +02:00
Eren Gölge dae79b0acd
Remove `/` prefix from the relative path (#2065) 2022-10-10 13:32:27 +02:00
Eren Gölge 843fa6f3fa
Check num of columns in coqui format (#2066)
* Check 4 columns in coqui format

* Fix encoding

* Fixup
2022-10-10 12:13:32 +02:00
Edresson Casanova f3b947e706
Minor bug fixes on VITS/YourTTS and inference (#2054)
* Set the right device to the speaker encoder

* Bug fix on inference list_language_idxs parameter

* Bug fix on speaker encoder resample audio transform
2022-10-06 22:23:54 +02:00
Eren Gölge 5f5d441ee5
Write non-speech files in a TXT (#2048)
* Write non-speech files in a txt

* Save 16-bit wav out of vad
2022-10-06 13:25:54 +02:00
Edresson Casanova d6ad9a05b4
Fix colliding dataset cache file names (#1994)
* Fix colliding dataset cache file names

* Remove unused code
2022-09-21 12:54:07 +02:00
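A hedged sketch of one way to avoid colliding cache file names by keying the cache on the dataset path; the function is illustrative, not the code in this PR:

```python
import hashlib
import os

def phoneme_cache_path(cache_dir: str, dataset_root: str) -> str:
    digest = hashlib.md5(dataset_root.encode("utf-8")).hexdigest()[:10]
    return os.path.join(cache_dir, f"phoneme_cache_{digest}")

print(phoneme_cache_path("/tmp/tts_cache", "/data/LJSpeech-1.1"))
print(phoneme_cache_path("/tmp/tts_cache", "/data/VCTK"))
```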
Edresson Casanova 3faccbda97
Fix dataset handling with the new embedding file keys (#1991) 2022-09-19 23:44:14 +02:00
Eren Gölge 0a112f7841
Add metafile arg (#1977) 2022-09-16 14:41:49 +02:00
Eren Gölge dba2c3570a
Update readme (#1978) 2022-09-16 12:01:46 +02:00
Julian Weber 896e46d0e5
Fix vc (#1971) 2022-09-16 12:01:26 +02:00
Eren Gölge b95cf3363c
Prevent installing mecab-ko (#1967) 2022-09-14 10:28:07 +02:00
Eren Gölge 9e5a469c64
d-vector handling (#1945)
* Update BaseDatasetConfig

- Add dataset_name
- Change name to formatter_name

* Update compute_embedding

- Allow entering dataset by args
- Use released model by default
- Use the new key format

* Update loading

* Update recipes

* Update other dep code

* Update tests

* Fixup

* Load multiple embedding files

* Fix argument names in dep code

* Update docs

* Fix argument name

* Fix linter
2022-09-13 14:10:33 +02:00
Edresson Casanova 371772c355
Replace pyworld by pyin (#1946)
* Replace pyworld by pyin

* Fix unit tests
2022-09-09 10:43:14 +02:00
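A minimal sketch of f0 extraction with librosa's `pyin`, the replacement named above; the file name and frequency bounds are illustrative:

```python
import librosa
import numpy as np

y, sr = librosa.load("sample.wav", sr=22050)
f0, voiced_flag, voiced_prob = librosa.pyin(
    y,
    fmin=librosa.note_to_hz("C2"),
    fmax=librosa.note_to_hz("C7"),
    sr=sr,
)
f0 = np.nan_to_num(f0)  # pyin returns NaN for unvoiced frames
```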
happylittlecat 4546b4cbd8
Add espeak support for Chinese (#1905)
* fix description

* add espeak support for chinese

* add espeak support for chinese
2022-09-08 12:32:41 +02:00
harmlessman 5abbe56642
Korean Phonemizer (#1822)
* Update requirements.txt

install jamo for korean

* Update formatters.py

add KSS formatter

KSS is a Korean single-speaker speech dataset (12 hours)

* Add files via upload

add phonemizer for korean

* Add files via upload

add korean phonemizer

* Update requirements.txt

* change code style with `black` and `pylint`

* reflecting pylint's Evaluation

* reflecting pylint's Evaluation

* reflecting pylint's Evaluation-2

* isort

* edit about separator
write test case and add 'nltk' for requirements.txt

* add korean g2p (g2pkk)

* isort

* TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py:43:24: W0621: Redefining name 'text' from outer scope (line 58) (redefined-outer-name)

TTS/tts/utils/text/korean/korean.py:28:8: R1705: Unnecessary "else" after "return" (no-else-return)

* black
2022-09-08 12:06:07 +02:00
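A hedged sketch of the Korean grapheme-to-phoneme chain the commits above describe (g2pkk followed by jamo decomposition); the exact package APIs are assumptions:

```python
from g2pkk import G2p          # assumed import, per the "add korean g2p (g2pkk)" commit
from jamo import hangul_to_jamo

g2p = G2p()
text = "안녕하세요"
phonemes = "".join(hangul_to_jamo(g2p(text)))
print(phonemes)
```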
Edresson Casanova 98aa6261d1
Add YourTTS and SC-GlowTTS on available models (#1933) 2022-09-08 11:10:39 +02:00
Edresson Casanova 159eeeef64
Fix find unique phonemes script (#1928)
* Fix find unique phonemes script

* Fix unit tests
2022-09-08 10:17:35 +02:00
KyuubiYoru 3b7dff568a
Fixes a race condition with multiple simultaneous get requests. (#1807)
* Fixes a race condition with multiple simultaneous get requests.

* Removed unused import

* Removed unused threading import

* Changed lock style to notation

* make style

Co-authored-by: WeberJulian <julian.weber@hotmail.fr>
2022-09-08 10:16:16 +02:00
Julian Weber bb59718c03
Add capacitron v2 model (#1768)
* Add capacitron v2 in .models.json

* Put right commit hash
2022-09-08 09:43:56 +02:00
Edresson Casanova 096b35f639
Add VCTK speaker encoder recipe (#1912) 2022-08-26 16:19:03 +02:00
Eren Gölge e5430a6519
Add new DE Thorsten models (#1898)
- Tacotron2-DDC
- HifiGAN vocoder
2022-08-22 11:27:39 +02:00
Eren Gölge 8845f06fd9 Bump up to v0.8.0 2022-08-22 11:26:47 +02:00
480 changed files with 152112 additions and 2066 deletions

View File

@ -1,2 +1,9 @@
.git/
Dockerfile
build/
dist/
TTS.egg-info/
tests/outputs/*
tests/train_outputs/*
__pycache__/
*.pyc

View File

@ -18,12 +18,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -31,6 +31,8 @@ jobs:
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update

View File

@ -18,12 +18,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -31,6 +31,8 @@ jobs:
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update

View File

@ -15,8 +15,8 @@ jobs:
matrix:
arch: ["amd64"]
base:
- "nvcr.io/nvidia/pytorch:22.03-py3" # GPU enabled
- "ubuntu:20.04" # CPU only
- "nvidia/cuda:11.8.0-base-ubuntu22.04" # GPU enabled
- "python:3.10.8-slim" # CPU only
steps:
- uses: actions/checkout@v2
- name: Log in to the Container registry
@ -32,7 +32,7 @@ jobs:
base="ghcr.io/coqui-ai/tts"
tags="" # PR build
if [[ ${{ matrix.base }} = "ubuntu:20.04" ]]; then
if [[ ${{ matrix.base }} = "python:3.10.8-slim" ]]; then
base="ghcr.io/coqui-ai/tts-cpu"
fi

View File

@ -18,12 +18,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -31,10 +31,14 @@ jobs:
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: |
export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends git make gcc
sudo apt-get install espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel

View File

@ -10,7 +10,7 @@ jobs:
build-sdist:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Verify tag matches version
run: |
set -ex
@ -21,7 +21,7 @@ jobs:
fi
- uses: actions/setup-python@v2
with:
python-version: 3.8
python-version: 3.9
- run: |
python -m pip install -U pip setuptools wheel build
- run: |
@ -36,9 +36,9 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10"]
python-version: ["3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
@ -64,14 +64,6 @@ jobs:
with:
name: "sdist"
path: "dist/"
- uses: actions/download-artifact@v2
with:
name: "wheel-3.7"
path: "dist/"
- uses: actions/download-artifact@v2
with:
name: "wheel-3.8"
path: "dist/"
- uses: actions/download-artifact@v2
with:
name: "wheel-3.9"
@ -80,6 +72,10 @@ jobs:
with:
name: "wheel-3.10"
path: "dist/"
- uses: actions/download-artifact@v2
with:
name: "wheel-3.11"
path: "dist/"
- run: |
ls -lh dist/
- name: Setup PyPI config
@ -91,7 +87,7 @@ jobs:
EOF
- uses: actions/setup-python@v2
with:
python-version: 3.8
python-version: 3.9
- run: |
python -m pip install twine
- run: |

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.9]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -42,6 +42,5 @@ jobs:
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Lint check
run: |
make lint
- name: Style check
run: make style

View File

@ -18,12 +18,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -31,6 +31,8 @@ jobs:
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update

View File

@ -18,12 +18,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -31,6 +31,8 @@ jobs:
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update

53
.github/workflows/tts_tests2.yml vendored Normal file
View File

@ -0,0 +1,53 @@
name: tts-tests2
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends git make gcc
sudo apt-get install espeak
sudo apt-get install espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: make test_tts2

View File

@ -18,12 +18,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -31,6 +31,8 @@ jobs:
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update

53
.github/workflows/xtts_tests.yml vendored Normal file
View File

@ -0,0 +1,53 @@
name: xtts-tests
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends git make gcc
sudo apt-get install espeak
sudo apt-get install espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: make test_xtts

54
.github/workflows/zoo_tests0.yml vendored Normal file
View File

@ -0,0 +1,54 @@
name: zoo-tests-0
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y git make gcc
sudo apt-get install espeak espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: |
nose2 -F -v -B TTS tests.zoo_tests.test_models.test_models_offset_0_step_3
nose2 -F -v -B TTS tests.zoo_tests.test_models.test_voice_conversion

53
.github/workflows/zoo_tests1.yml vendored Normal file
View File

@ -0,0 +1,53 @@
name: zoo-tests-1
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y git make gcc
sudo apt-get install espeak espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\/hf\/bark\//https:\/\/huggingface.co\/erogol\/bark\/resolve\/main\//g' TTS/.models.json
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_1_step_3

View File

@ -1,4 +1,4 @@
name: zoo-tests
name: zoo-tests-2
on:
push:
@ -18,12 +18,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -31,6 +31,8 @@ jobs:
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: set ENV
run: export TRAINER_TELEMETRY=0
- name: Install dependencies
run: |
sudo apt-get update
@ -47,4 +49,4 @@ jobs:
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: make test_zoo
run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_2_step_3

4
.gitignore vendored
View File

@ -137,7 +137,7 @@ VCTK-Corpus-removed-silence/*
# ignore training logs
trainer_*_log.txt
# files used internally fro dev, test etc.
# files used internally for dev, test etc.
tests/outputs/*
tests/train_outputs/*
TODO.txt
@ -168,3 +168,5 @@ internal/*
wandb
depot/*
coqui_recipes/*
local_scripts/*
coqui_demos/*

View File

@ -6,7 +6,7 @@ repos:
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: 'https://github.com/psf/black'
rev: 20.8b1
rev: 22.3.0
hooks:
- id: black
language_version: python3

View File

@ -169,7 +169,9 @@ disable=missing-docstring,
comprehension-escape,
duplicate-code,
not-callable,
import-outside-toplevel
import-outside-toplevel,
logging-fstring-interpolation,
logging-not-lazy
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option

View File

@ -5,14 +5,19 @@
# Required
version: 2
# Set the version of Python and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.11"
# Optionally set the version of Python and requirements required to build your docs
python:
install:
- requirements: docs/requirements.txt
- requirements: requirements.txt
# Build documentation in the docs/ directory with Sphinx
sphinx:
builder: html
configuration: docs/source/conf.py
# Optionally set the version of Python and requirements required to build your docs
python:
version: 3.7
install:
- requirements: docs/requirements.txt
- requirements: requirements.txt

View File

@ -36,7 +36,7 @@ This model can be shared in two ways:
Models are served under `.models.json` file and any model is available under TTS CLI or Server end points.
Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/issues/380).
Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/discussions/930).
## Sending a ✨**PR**✨
@ -48,7 +48,7 @@ The following steps are tested on an Ubuntu system.
1. Fork 🐸TTS[https://github.com/coqui-ai/TTS] by clicking the fork button at the top right corner of the project page.
2. Clone 🐸TTS and add the main repo as a new remote named ```upsteam```.
2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
```bash
$ git clone git@github.com:<your Github name>/TTS.git
@ -128,6 +128,32 @@ The following steps are tested on an Ubuntu system.
14. Once things look perfect, We merge it to the ```dev``` branch and make it ready for the next version.
## Development in Docker container
If you prefer working within a Docker container as your development environment, you can do the following:
1. Fork 🐸TTS[https://github.com/coqui-ai/TTS] by clicking the fork button at the top right corner of the project page.
2. Clone 🐸TTS and add the main repo as a new remote named ```upsteam```.
```bash
$ git clone git@github.com:<your Github name>/TTS.git
$ cd TTS
$ git remote add upstream https://github.com/coqui-ai/TTS.git
```
3. Build the Docker Image as your development environment (it installs all of the dependencies for you):
```
docker build --tag=tts-dev:latest -f .\dockerfiles\Dockerfile.dev .
```
4. Run the container with GPU support:
```
docker run -it --gpus all tts-dev:latest /bin/bash
```
Feel free to ping us at any step you need help using our communication channels.
If you are new to Github or open-source contribution, These are good resources.

View File

@ -1,20 +1,19 @@
ARG BASE=nvcr.io/nvidia/pytorch:22.03-py3
ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
FROM ${BASE}
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
RUN pip install llvmlite --ignore-installed
# Create and activate virtual env
ENV VIRTUAL_ENV=/venv
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN pip install -U pip setuptools wheel
RUN apt-get update && apt-get upgrade -y
RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
RUN pip3 install llvmlite --ignore-installed
# Install Dependencies:
RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
RUN rm -rf /root/.cache/pip
# Copy TTS repository contents:
WORKDIR /root
COPY requirements.txt /root
COPY requirements.dev.txt /root
COPY requirements.notebooks.txt /root
RUN ["/bin/bash", "-c", "pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)"]
COPY . /root
RUN make install
ENTRYPOINT ["tts"]
CMD ["--help"]

View File

@ -19,6 +19,12 @@ test_vocoder: ## run vocoder tests.
test_tts: ## run tts tests.
nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests
test_tts2: ## run tts tests.
nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests2
test_xtts:
nose2 -F -v -B --with-coverage --coverage TTS tests.xtts_tests
test_aux: ## run aux tests.
nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests
./run_bash_tests.sh

324
README.md
View File

@ -1,9 +1,29 @@
# <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
🐸TTS is a library for advanced Text-to-Speech generation. It's built on the latest research, was designed to achieve the best trade-off among ease-of-training, speed and quality.
🐸TTS comes with pretrained models, tools for measuring dataset quality and already used in **20+ languages** for products and research projects.
## 🐸Coqui.ai News
- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
- 📣 ⓍTTS can now stream with <200ms latency.
- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html)
[![Gitter](https://badges.gitter.im/coqui-ai/TTS.svg)](https://gitter.im/coqui-ai/TTS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
<div align="center">
<img src="https://static.scarf.sh/a.png?x-pxid=cf317fe7-2188-4721-bc01-124bb5d5dbb2" />
## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
**🐸TTS is a library for advanced Text-to-Speech generation.**
🚀 Pretrained models in +1100 languages.
🛠️ Tools for training new models and fine-tuning existing models in any language.
📚 Utilities for dataset analysis and curation.
______________________________________________________________________
[![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv)
[![License](<https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg>)](https://opensource.org/licenses/MPL-2.0)
[![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS)
[![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
@ -18,16 +38,14 @@
![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/text_tests.yml/badge.svg)
![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/tts_tests.yml/badge.svg)
![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/vocoder_tests.yml/badge.svg)
![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests.yml/badge.svg)
![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests0.yml/badge.svg)
![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests1.yml/badge.svg)
![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests2.yml/badge.svg)
[![Docs](<https://readthedocs.org/projects/tts/badge/?version=latest&style=plastic>)](https://tts.readthedocs.io/en/latest/)
📰 [**Subscribe to 🐸Coqui.ai Newsletter**](https://coqui.ai/?subscription=true)
</div>
📢 [English Voice Samples](https://erogol.github.io/ddc-samples/) and [SoundCloud playlist](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2)
📄 [Text-to-Speech paper collection](https://github.com/erogol/TTS-papers)
<img src="https://static.scarf.sh/a.png?x-pxid=cf317fe7-2188-4721-bc01-124bb5d5dbb2" />
______________________________________________________________________
## 💬 Where to ask questions
Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly so that more people can benefit from it.
@ -36,12 +54,12 @@ Please use our dedicated channels for questions and discussion. Help is much mor
| ------------------------------- | --------------------------------------- |
| 🚨 **Bug Reports** | [GitHub Issue Tracker] |
| 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] |
| 👩‍💻 **Usage Questions** | [Github Discussions] |
| 🗯 **General Discussion** | [Github Discussions] or [Gitter Room] |
| 👩‍💻 **Usage Questions** | [GitHub Discussions] |
| 🗯 **General Discussion** | [GitHub Discussions] or [Discord] |
[github issue tracker]: https://github.com/coqui-ai/tts/issues
[github discussions]: https://github.com/coqui-ai/TTS/discussions
[gitter room]: https://gitter.im/coqui-ai/TTS?utm_source=share-link&utm_medium=link&utm_campaign=share-link
[discord]: https://discord.gg/5eXr5seRrv
[Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials
@ -49,16 +67,17 @@ Please use our dedicated channels for questions and discussion. Help is much mor
| Type | Links |
| ------------------------------- | --------------------------------------- |
| 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/)
| 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#install-tts)|
| 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#installation)|
| 👩‍💻 **Contributing** | [CONTRIBUTING.md](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md)|
| 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378)
| 🚀 **Released Models** | [TTS Releases](https://github.com/coqui-ai/TTS/releases) and [Experimental Models](https://github.com/coqui-ai/TTS/wiki/Experimental-Released-Models)|
| 📰 **Papers** | [TTS Papers](https://github.com/erogol/TTS-papers)|
## 🥇 TTS Performance
<p align="center"><img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/TTS-performance.png" width="800" /></p>
Underlined "TTS*" and "Judy*" are 🐸TTS models
<!-- [Details...](https://github.com/coqui-ai/TTS/wiki/Mean-Opinion-Score-Results) -->
Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not released open-source. They are here to show the potential. Models prefixed with a dot (.Jofish .Abe and .Janice) are real human voices.
## Features
- High-performance Deep Learning models for Text2Speech tasks.
@ -74,8 +93,8 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- Utilities to use and test your models.
- Modular (but not too much) code base enabling easy implementation of new ideas.
## Implemented Models
### Text-to-Spectrogram
## Model Implementations
### Spectrogram models
- Tacotron: [paper](https://arxiv.org/abs/1703.10135)
- Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
- Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
@ -83,9 +102,19 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
- FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf)
- FastSpeech: [paper](https://arxiv.org/abs/1905.09263)
- FastSpeech2: [paper](https://arxiv.org/abs/2006.04558)
- SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557)
- Capacitron: [paper](https://arxiv.org/abs/1906.03402)
- OverFlow: [paper](https://arxiv.org/abs/2211.06892)
- Neural HMM TTS: [paper](https://arxiv.org/abs/2108.13320)
- Delightful TTS: [paper](https://arxiv.org/abs/2110.12612)
### End-to-End Models
- ⓍTTS: [blog](https://coqui.ai/blog/tts/open_xtts)
- VITS: [paper](https://arxiv.org/pdf/2106.06103)
- 🐸 YourTTS: [paper](https://arxiv.org/abs/2112.02418)
- 🐢 Tortoise: [orig. repo](https://github.com/neonbjb/tortoise-tts)
- 🐶 Bark: [orig. repo](https://github.com/suno-ai/bark)
### Attention Methods
- Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
@ -109,10 +138,13 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- HiFiGAN: [paper](https://arxiv.org/abs/2010.05646)
- UnivNet: [paper](https://arxiv.org/abs/2106.07889)
### Voice Conversion
- FreeVC: [paper](https://arxiv.org/abs/2210.15418)
You can also help us implement more models.
## Install TTS
🐸TTS is tested on Ubuntu 18.04 with **python >= 3.7, < 3.11.**.
## Installation
🐸TTS is tested on Ubuntu 18.04 with **python >= 3.9, < 3.12.**.
If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.
@ -136,101 +168,225 @@ $ make install
If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system).
## Use TTS
### Single Speaker Models
## Docker Image
You can also try TTS without install with the docker image.
Simply run the following command and you will be able to run TTS without installing it.
```bash
docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
python3 TTS/server/server.py --list_models #To get the list of available models
python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server
```
You can then enjoy the TTS server [here](http://[::1]:5002/)
More details about the docker images (like GPU support) can be found [here](https://tts.readthedocs.io/en/latest/docker_images.html)
## Synthesizing speech by 🐸TTS
### 🐍 Python API
#### Running a multi-speaker and multi-lingual model
```python
import torch
from TTS.api import TTS
# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"
# List available 🐸TTS models
print(TTS().list_models())
# Init TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
# Run TTS
# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language
# Text to speech list of amplitude values as output
wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en")
# Text to speech to a file
tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
```
#### Running a single speaker model
```python
# Init TTS with the target model name
tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False).to(device)
# Run TTS
tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)
# Example voice cloning with YourTTS in English, French and Portuguese
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device)
tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav")
```
#### Example voice conversion
Converting the voice in `source_wav` to the voice of `target_wav`
```python
tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to("cuda")
tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
```
#### Example voice cloning together with the voice conversion model.
This way, you can clone voices by using any model in 🐸TTS.
```python
tts = TTS("tts_models/de/thorsten/tacotron2-DDC")
tts.tts_with_vc_to_file(
"Wie sage ich auf Italienisch, dass ich dich liebe?",
speaker_wav="target/speaker.wav",
file_path="output.wav"
)
```
#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
For Fairseq models, use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html)
and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
```python
# TTS with on the fly voice conversion
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
"Wie sage ich auf Italienisch, dass ich dich liebe?",
speaker_wav="target/speaker.wav",
file_path="output.wav"
)
```
### Command-line `tts`
<!-- begin-tts-readme -->
Synthesize speech on command line.
You can either use your trained model or choose a model from the provided list.
If you don't specify any models, then it uses LJSpeech based English model.
#### Single Speaker Models
- List provided models:
```
$ tts --list_models
```
```
$ tts --list_models
```
- Get model info (for both tts_models and vocoder_models):
- Query by type/name:
The model_info_by_name uses the name as it from the --list_models.
```
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
```
For example:
```
$ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
```
```
$ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
```
- Query by type/idx:
The model_query_idx uses the corresponding idx from --list_models.
```
$ tts --model_info_by_idx "<model_type>/<model_query_idx>"
```
For example:
```
$ tts --model_info_by_idx tts_models/3
```
- Run TTS with default models:
- Query by type/name:
The model_info_by_name uses the name as it from the --list_models.
```
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
```
For example:
```
$ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
$ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
```
- Query by type/idx:
The model_query_idx uses the corresponding idx from --list_models.
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav
$ tts --model_info_by_idx "<model_type>/<model_query_idx>"
```
For example:
```
$ tts --model_info_by_idx tts_models/3
```
- Query info for model info by full name:
```
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
```
- Run TTS with default models:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav
```
- Run TTS and pipe out the generated TTS wav file data:
```
$ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
```
- Run a TTS model with its default vocoder model:
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
```
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
```
For example:
```
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
```
```
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
```
- Run with specific TTS and vocoder models from the list:
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
```
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
```
For example:
```
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
```
```
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
```
- Run your own TTS model (Using Griffin-Lim Vocoder):
```
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```
```
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```
- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```
### Multi-speaker Models
```
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```
- List the available speakers and choose as <speaker_id> among them:
#### Multi-speaker Models
```
$ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
```
- List the available speakers and choose a <speaker_id> among them:
```
$ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
```
- Run the multi-speaker TTS model with the target speaker ID:
```
$ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
```
```
$ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
```
- Run your own multi-speaker TTS model:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
### Voice Conversion Models
```
$ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
```
<!-- end-tts-readme -->
## Directory Structure
```
@ -239,8 +395,6 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
|- TTS
|- bin/ (folder for all the executables.)
|- train*.py (train your target model.)
|- distribute.py (train your TTS model using Multiple GPUs.)
|- compute_statistics.py (compute dataset statistics for normalization.)
|- ...
|- tts/ (text to speech models)
|- layers/ (model layer definitions)

View File

@ -1,14 +1,115 @@
{
"tts_models": {
"multilingual":{
"multi-dataset":{
"your_tts":{
"multilingual": {
"multi-dataset": {
"xtts_v2": {
"description": "XTTS-v2.0.3 by Coqui with 17 languages.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/speakers_xtts.pth"
],
"model_hash": "10f92b55c512af7a8d39d650547a15a7",
"default_vocoder": null,
"commit": "480a6cdf7",
"license": "CPML",
"contact": "info@coqui.ai",
"tos_required": true
},
"xtts_v1.1": {
"description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/hash.md5"
],
"model_hash": "7c62beaf58d39b729de287330dc254e7b515677416839b649a50e7cf74c3df59",
"default_vocoder": null,
"commit": "82910a63",
"license": "CPML",
"contact": "info@coqui.ai",
"tos_required": true
},
"your_tts": {
"description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
"default_vocoder": null,
"commit": "e9a1953e",
"license": "CC BY-NC-ND 4.0",
"contact": "egolge@coqui.ai"
},
"bark": {
"description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
"https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
"https://coqui.gateway.scarf.sh/hf/text_2.pt",
"https://coqui.gateway.scarf.sh/hf/bark/config.json",
"https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
"https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
],
"default_vocoder": null,
"commit": "e9a1953e",
"license": "MIT",
"contact": "https://www.suno.ai/"
}
}
},
"bg": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"cs": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"da": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"et": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"ga": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
@ -79,6 +180,14 @@
"license": "apache 2.0",
"contact": "egolge@coqui.com"
},
"vits--neon": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
"default_vocoder": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause",
"contact": null,
"commit": null
},
"fast_pitch": {
"description": "FastPitch model trained on LJSpeech using the Aligner Network",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
@ -87,6 +196,24 @@
"author": "Eren Gölge @erogol",
"license": "apache 2.0",
"contact": "egolge@coqui.com"
},
"overflow": {
"description": "Overflow model trained on LJSpeech",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
"commit": "3b1a28f",
"author": "Eren Gölge @erogol",
"license": "apache 2.0",
"contact": "egolge@coqui.ai"
},
"neural_hmm": {
"description": "Neural HMM model trained on LJSpeech",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.11.0_models/tts_models--en--ljspeech--neural_hmm.zip",
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
"commit": "3b1a28f",
"author": "Shivam Metha @shivammehta25",
"license": "apache 2.0",
"contact": "d83ee8fe45e3c0d776d4a865aca21d7c2ac324c4"
}
},
"vctk": {
@ -99,7 +226,7 @@
"license": "apache 2.0",
"contact": "egolge@coqui.ai"
},
"fast_pitch":{
"fast_pitch": {
"description": "FastPitch model trained on VCTK dataseset.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
"default_vocoder": null,
@ -130,15 +257,45 @@
"license": "apache 2.0",
"contact": "adamfroghyar@gmail.com"
},
"capacitron-t2-c150": {
"capacitron-t2-c150_v2": {
"description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c150.zip",
"commit": "d6284e7",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
"commit": "a67039d",
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
"author": "Adam Froghyar @a-froghyar",
"license": "apache 2.0",
"contact": "adamfroghyar@gmail.com"
}
},
"multi-dataset": {
"tortoise-v2": {
"description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
"github_rls_url": [
"https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
],
"commit": "c1875f6",
"default_vocoder": null,
"author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
"license": "apache 2.0"
}
},
"jenny": {
"jenny": {
"description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
"default_vocoder": null,
"commit": "ba40a1c",
"license": "custom - see https://github.com/dioco-group/jenny-tts-dataset#important",
"author": "@noml4u"
}
}
},
"es": {
@ -151,6 +308,15 @@
"license": "MPL",
"contact": "egolge@coqui.com"
}
},
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"fr": {
@ -158,22 +324,38 @@
"tacotron2-DDC": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
"commit": "",
"commit": null,
"author": "Eren Gölge @erogol",
"license": "MPL",
"contact": "egolge@coqui.com"
}
},
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"uk":{
"uk": {
"mai": {
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
"author":"@robinhad",
"author": "@robinhad",
"commit": "bdab788d",
"license": "MIT",
"contact": "",
"default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
},
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
@ -198,6 +380,15 @@
"stats_file": null,
"commit": "540d811"
}
},
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"de": {
@ -215,6 +406,23 @@
"author": "@thorstenMueller",
"license": "apache 2.0",
"commit": "unknown"
},
"tacotron2-DDC": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
"description": "Thorsten-Dec2021-22k-DDC",
"author": "@thorstenMueller",
"license": "apache 2.0",
"commit": "unknown"
}
},
"css10": {
"vits-neon": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
"default_vocoder": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause",
"commit": null
}
}
},
@ -230,9 +438,9 @@
}
}
},
"tr":{
"tr": {
"common-voice": {
"glow-tts":{
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
"default_vocoder": "vocoder_models/tr/common-voice/hifigan",
"license": "MIT",
@ -244,7 +452,7 @@
},
"it": {
"mai_female": {
"glow-tts":{
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -252,7 +460,7 @@
"license": "apache 2.0",
"commit": null
},
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -262,7 +470,7 @@
}
},
"mai_male": {
"glow-tts":{
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -270,7 +478,7 @@
"license": "apache 2.0",
"commit": null
},
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -282,7 +490,7 @@
},
"ewe": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -294,7 +502,7 @@
},
"hau": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -306,7 +514,7 @@
},
"lin": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -318,7 +526,7 @@
},
"tw_akuapem": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -330,7 +538,7 @@
},
"tw_asante": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -342,7 +550,7 @@
},
"yor": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -351,6 +559,205 @@
"commit": "1b22f03"
}
}
},
"hu": {
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"el": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"fi": {
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"hr": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"lt": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"lv": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"mt": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"pl": {
"mai_female": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"pt": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"ro": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sk": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sl": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sv": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"ca": {
"custom": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
"default_vocoder": null,
"commit": null,
"description": " It is trained from zero with 101460 utterances consisting of 257 speakers, approx 138 hours of speech. We used three datasets;\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
"author": "@gullabi",
"license": "CC-BY-4.0"
}
}
},
"fa": {
"custom": {
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
"default_vocoder": null,
"commit": null,
"description": "persian-tts-female-glow_tts model for text to speech purposes. Single-speaker female voice Trained on persian-tts-dataset-famale. \nThis model has no compatible vocoder thus the output quality is not very good. \nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
"author": "@karim23657",
"license": "CC-BY-4.0"
}
}
},
"bn": {
"custom": {
"vits-male": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
"default_vocoder": null,
"commit": null,
"description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
"author": "@mobassir94",
"license": "Apache 2.0"
},
"vits-female": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
"default_vocoder": null,
"commit": null,
"description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
"author": "@mobassir94",
"license": "Apache 2.0"
}
}
},
"be": {
"common-voice": {
"glow-tts":{
"description": "Belarusian GlowTTS model created by @alex73 (Github).",
"github_rls_url":"https://coqui.gateway.scarf.sh/v0.16.6/tts_models--be--common-voice--glow-tts.zip",
"default_vocoder": "vocoder_models/be/common-voice/hifigan",
"commit": "c0aabb85",
"license": "CC-BY-SA 4.0",
"contact": "alex73mail@gmail.com"
}
}
}
},
"vocoder_models": {
@ -460,6 +867,13 @@
"author": "@thorstenMueller",
"license": "apache 2.0",
"commit": "unknown"
},
"hifigan_v1": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
"description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
"author": "@thorstenMueller",
"license": "apache 2.0",
"commit": "unknown"
}
}
},
@ -478,16 +892,16 @@
"mai": {
"multiband-melgan": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
"author":"@robinhad",
"author": "@robinhad",
"commit": "bdab788d",
"license": "MIT",
"contact": ""
}
}
},
"tr":{
"tr": {
"common-voice": {
"hifigan":{
"hifigan": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
"description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
"author": "Fatih Akademi",
@ -495,6 +909,30 @@
"commit": null
}
}
},
"be": {
"common-voice": {
"hifigan": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.16.6/vocoder_models--be--common-voice--hifigan.zip",
"description": "Belarusian HiFiGAN model created by @alex73 (Github).",
"author": "@alex73",
"license": "CC-BY-SA 4.0",
"commit": "c0aabb85"
}
}
}
},
"voice_conversion_models": {
"multilingual": {
"vctk": {
"freevc24": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
"description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
"author": "Jing-Yi Li @OlaWod",
"license": "MIT",
"commit": null
}
}
}
}
}
}
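Each leaf of this registry maps to a model name of the form `<model_type>/<language>/<dataset>/<model>` that both the `tts` CLI and the Python API resolve through `ModelManager`. A minimal sketch using the `xtts_v2` entry above (the reference clip path is a placeholder, and this CPML-licensed model requires accepting its terms on first download):
```
from TTS.api import TTS

# "tts_models" / "multilingual" / "multi-dataset" / "xtts_v2" from the registry above.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False)
tts.tts_to_file(
    text="Hello from XTTS.",
    speaker_wav="reference.wav",  # placeholder: short reference clip for voice cloning
    language="en",
    file_path="xtts_output.wav",
)
```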


@ -1 +1 @@
0.7.1
0.22.0

TTS/api.py (new file, 458 lines)

@ -0,0 +1,458 @@
import tempfile
import warnings
from pathlib import Path
from typing import Union
import numpy as np
from torch import nn
from TTS.utils.audio.numpy_transforms import save_wav
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
from TTS.config import load_config
class TTS(nn.Module):
"""TODO: Add voice conversion and Capacitron support."""
def __init__(
self,
model_name: str = "",
model_path: str = None,
config_path: str = None,
vocoder_path: str = None,
vocoder_config_path: str = None,
progress_bar: bool = True,
gpu=False,
):
"""🐸TTS python interface that allows to load and use the released models.
Example with a multi-speaker model:
>>> from TTS.api import TTS
>>> tts = TTS(TTS.list_models()[0])
>>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
>>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
Example with a single-speaker model:
>>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
>>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
Example loading a model from a path:
>>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False)
>>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
Example voice cloning with YourTTS in English, French and Portuguese:
>>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
>>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav")
>>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
>>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")
Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html):
>>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True)
>>> tts.tts_to_file("This is a test.", file_path="output.wav")
Args:
model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
model_path (str, optional): Path to the model checkpoint. Defaults to None.
config_path (str, optional): Path to the model config. Defaults to None.
vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
super().__init__()
self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
self.config = load_config(config_path) if config_path else None
self.synthesizer = None
self.voice_converter = None
self.model_name = ""
if gpu:
warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")
if model_name is not None and len(model_name) > 0:
if "tts_models" in model_name:
self.load_tts_model_by_name(model_name, gpu)
elif "voice_conversion_models" in model_name:
self.load_vc_model_by_name(model_name, gpu)
else:
self.load_model_by_name(model_name, gpu)
if model_path:
self.load_tts_model_by_path(
model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
)
@property
def models(self):
return self.manager.list_tts_models()
@property
def is_multi_speaker(self):
if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
return False
@property
def is_multi_lingual(self):
# Not sure what sets this to None, but applied a fix to prevent crashing.
if (
isinstance(self.model_name, str)
and "xtts" in self.model_name
or self.config
and ("xtts" in self.config.model or len(self.config.languages) > 1)
):
return True
if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
return self.synthesizer.tts_model.language_manager.num_languages > 1
return False
@property
def speakers(self):
if not self.is_multi_speaker:
return None
return self.synthesizer.tts_model.speaker_manager.speaker_names
@property
def languages(self):
if not self.is_multi_lingual:
return None
return self.synthesizer.tts_model.language_manager.language_names
@staticmethod
def get_models_file_path():
return Path(__file__).parent / ".models.json"
def list_models(self):
return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
def download_model_by_name(self, model_name: str):
model_path, config_path, model_item = self.manager.download_model(model_name)
if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
# return model directory if there are multiple files
# we assume that the model knows how to load itself
return None, None, None, None, model_path
if model_item.get("default_vocoder") is None:
return model_path, config_path, None, None, None
vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
return model_path, config_path, vocoder_path, vocoder_config_path, None
def load_model_by_name(self, model_name: str, gpu: bool = False):
"""Load one of the 🐸TTS models by name.
Args:
model_name (str): Model name to load. You can list models by ```tts.models```.
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
self.load_tts_model_by_name(model_name, gpu)
def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
"""Load one of the voice conversion models by name.
Args:
model_name (str): Model name to load. You can list models by ```tts.models```.
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
self.model_name = model_name
model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)
def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
"""Load one of 🐸TTS models by name.
Args:
model_name (str): Model name to load. You can list models by ```tts.models```.
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
TODO: Add tests
"""
self.synthesizer = None
self.model_name = model_name
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
model_name
)
# init synthesizer
# None values are fetched from the model
self.synthesizer = Synthesizer(
tts_checkpoint=model_path,
tts_config_path=config_path,
tts_speakers_file=None,
tts_languages_file=None,
vocoder_checkpoint=vocoder_path,
vocoder_config=vocoder_config_path,
encoder_checkpoint=None,
encoder_config=None,
model_dir=model_dir,
use_cuda=gpu,
)
def load_tts_model_by_path(
self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
):
"""Load a model from a path.
Args:
model_path (str): Path to the model checkpoint.
config_path (str): Path to the model config.
vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
vocoder_config (str, optional): Path to the vocoder config. Defaults to None.
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
self.synthesizer = Synthesizer(
tts_checkpoint=model_path,
tts_config_path=config_path,
tts_speakers_file=None,
tts_languages_file=None,
vocoder_checkpoint=vocoder_path,
vocoder_config=vocoder_config,
encoder_checkpoint=None,
encoder_config=None,
use_cuda=gpu,
)
def _check_arguments(
self,
speaker: str = None,
language: str = None,
speaker_wav: str = None,
emotion: str = None,
speed: float = None,
**kwargs,
) -> None:
"""Check if the arguments are valid for the model."""
# check for the coqui tts models
if self.is_multi_speaker and (speaker is None and speaker_wav is None):
raise ValueError("Model is multi-speaker but no `speaker` is provided.")
if self.is_multi_lingual and language is None:
raise ValueError("Model is multi-lingual but no `language` is provided.")
if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
raise ValueError("Model is not multi-speaker but `speaker` is provided.")
if not self.is_multi_lingual and language is not None:
raise ValueError("Model is not multi-lingual but `language` is provided.")
if not emotion is None and not speed is None:
raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.")
def tts(
self,
text: str,
speaker: str = None,
language: str = None,
speaker_wav: str = None,
emotion: str = None,
speed: float = None,
split_sentences: bool = True,
**kwargs,
):
"""Convert text to speech.
Args:
text (str):
Input text to synthesize.
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
supported by `XTTS` model.
speaker_wav (str, optional):
Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
Defaults to None.
emotion (str, optional):
Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
speed (float, optional):
Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
Defaults to None.
split_sentences (bool, optional):
Split text into sentences, synthesize them separately and concatenate the file audio.
Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
kwargs (dict, optional):
Additional arguments for the model.
"""
self._check_arguments(
speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
)
wav = self.synthesizer.tts(
text=text,
speaker_name=speaker,
language_name=language,
speaker_wav=speaker_wav,
reference_wav=None,
style_wav=None,
style_text=None,
reference_speaker_name=None,
split_sentences=split_sentences,
**kwargs,
)
return wav
def tts_to_file(
self,
text: str,
speaker: str = None,
language: str = None,
speaker_wav: str = None,
emotion: str = None,
speed: float = 1.0,
pipe_out=None,
file_path: str = "output.wav",
split_sentences: bool = True,
**kwargs,
):
"""Convert text to speech.
Args:
text (str):
Input text to synthesize.
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
language (str, optional):
Language code for multi-lingual models. You can check whether loaded model is multi-lingual
`tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
speaker_wav (str, optional):
Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
Defaults to None.
emotion (str, optional):
Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
speed (float, optional):
Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
pipe_out (BytesIO, optional):
Flag to stdout the generated TTS wav file for shell pipe.
file_path (str, optional):
Output file path. Defaults to "output.wav".
split_sentences (bool, optional):
Split text into sentences, synthesize them separately and concatenate the file audio.
Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
kwargs (dict, optional):
Additional arguments for the model.
"""
self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
wav = self.tts(
text=text,
speaker=speaker,
language=language,
speaker_wav=speaker_wav,
split_sentences=split_sentences,
**kwargs,
)
self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
return file_path
def voice_conversion(
self,
source_wav: str,
target_wav: str,
):
"""Voice conversion with FreeVC. Convert source wav to target speaker.
Args:
source_wav (str):
Path to the source wav file.
target_wav (str):
Path to the target wav file.
"""
wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)
return wav
def voice_conversion_to_file(
self,
source_wav: str,
target_wav: str,
file_path: str = "output.wav",
):
"""Voice conversion with FreeVC. Convert source wav to target speaker.
Args:
source_wav (str):
Path to the source wav file.
target_wav (str):
Path to the target wav file.
file_path (str, optional):
Output file path. Defaults to "output.wav".
"""
wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav)
save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
return file_path
def tts_with_vc(
self,
text: str,
language: str = None,
speaker_wav: str = None,
speaker: str = None,
split_sentences: bool = True,
):
"""Convert text to speech with voice conversion.
It combines tts with voice conversion to fake voice cloning.
- Convert text to speech with tts.
- Convert the output wav to target speaker with voice conversion.
Args:
text (str):
Input text to synthesize.
language (str, optional):
Language code for multi-lingual models. You can check whether loaded model is multi-lingual
`tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
speaker_wav (str, optional):
Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
Defaults to None.
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
split_sentences (bool, optional):
Split text into sentences, synthesize them separately and concatenate the file audio.
Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
"""
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
# Lazy code... save it to a temp file to resample it while reading it for VC
self.tts_to_file(
text=text, speaker=speaker, language=language, file_path=fp.name, split_sentences=split_sentences
)
if self.voice_converter is None:
self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
return wav
def tts_with_vc_to_file(
self,
text: str,
language: str = None,
speaker_wav: str = None,
file_path: str = "output.wav",
speaker: str = None,
split_sentences: bool = True,
):
"""Convert text to speech with voice conversion and save to file.
Check `tts_with_vc` for more details.
Args:
text (str):
Input text to synthesize.
language (str, optional):
Language code for multi-lingual models. You can check whether loaded model is multi-lingual
`tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
speaker_wav (str, optional):
Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
Defaults to None.
file_path (str, optional):
Output file path. Defaults to "output.wav".
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
split_sentences (bool, optional):
Split text into sentences, synthesize them separately and concatenate the file audio.
Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
applicable to the 🐸TTS models. Defaults to True.
"""
wav = self.tts_with_vc(
text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences
)
save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
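As a usage note for the `tts_with_vc*` helpers above, a minimal sketch (the target voice path is a placeholder; the FreeVC converter is loaded on demand, as in `tts_with_vc`):
```
from TTS.api import TTS

# Single-speaker German model from the registry; any regular TTS model works here.
tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False)
tts.tts_with_vc_to_file(
    text="Ich bin eine Testnachricht.",
    speaker_wav="target_voice.wav",  # placeholder: reference clip of the target voice
    file_path="output.wav",
)
```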


@ -6,79 +6,192 @@ import torch
from tqdm import tqdm
from TTS.config import load_config
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.managers import save_file
from TTS.tts.utils.speakers import SpeakerManager
parser = argparse.ArgumentParser(
description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
"""
Example runs:
python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json
""",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
parser.add_argument("config_path", type=str, help="Path to model config file.")
parser.add_argument("config_dataset_path", type=str, help="Path to dataset config file.")
parser.add_argument("--output_path", type=str, help="Path for output `pth` or `json` file.", default="speakers.pth")
parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)
args = parser.parse_args()
def compute_embeddings(
model_path,
config_path,
output_path,
old_speakers_file=None,
old_append=False,
config_dataset_path=None,
formatter_name=None,
dataset_name=None,
dataset_path=None,
meta_file_train=None,
meta_file_val=None,
disable_cuda=False,
no_eval=False,
):
use_cuda = torch.cuda.is_available() and not disable_cuda
use_cuda = torch.cuda.is_available() and not args.disable_cuda
c_dataset = load_config(args.config_dataset_path)
meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
if meta_data_eval is None:
wav_files = meta_data_train
else:
wav_files = meta_data_train + meta_data_eval
encoder_manager = SpeakerManager(
encoder_model_path=args.model_path,
encoder_config_path=args.config_path,
d_vectors_file_path=args.old_file,
use_cuda=use_cuda,
)
class_name_key = encoder_manager.encoder_config.class_name_key
# compute speaker embeddings
speaker_mapping = {}
for idx, wav_file in enumerate(tqdm(wav_files)):
if isinstance(wav_file, dict):
class_name = wav_file[class_name_key]
wav_file = wav_file["audio_file"]
if config_dataset_path is not None:
c_dataset = load_config(config_dataset_path)
meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not no_eval)
else:
class_name = None
c_dataset = BaseDatasetConfig()
c_dataset.formatter = formatter_name
c_dataset.dataset_name = dataset_name
c_dataset.path = dataset_path
if meta_file_train is not None:
c_dataset.meta_file_train = meta_file_train
if meta_file_val is not None:
c_dataset.meta_file_val = meta_file_val
meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval)
wav_file_name = os.path.basename(wav_file)
if args.old_file is not None and wav_file_name in encoder_manager.clip_ids:
# get the embedding from the old file
embedd = encoder_manager.get_embedding_by_clip(wav_file_name)
if meta_data_eval is None:
samples = meta_data_train
else:
# extract the embedding
embedd = encoder_manager.compute_embedding_from_clip(wav_file)
samples = meta_data_train + meta_data_eval
# create speaker_mapping if target dataset is defined
speaker_mapping[wav_file_name] = {}
speaker_mapping[wav_file_name]["name"] = class_name
speaker_mapping[wav_file_name]["embedding"] = embedd
encoder_manager = SpeakerManager(
encoder_model_path=model_path,
encoder_config_path=config_path,
d_vectors_file_path=old_speakers_file,
use_cuda=use_cuda,
)
if speaker_mapping:
# save speaker_mapping if target dataset is defined
if os.path.isdir(args.output_path):
mapping_file_path = os.path.join(args.output_path, "speakers.pth")
class_name_key = encoder_manager.encoder_config.class_name_key
# compute speaker embeddings
if old_speakers_file is not None and old_append:
speaker_mapping = encoder_manager.embeddings
else:
mapping_file_path = args.output_path
speaker_mapping = {}
if os.path.dirname(mapping_file_path) != "":
os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
for fields in tqdm(samples):
class_name = fields[class_name_key]
audio_file = fields["audio_file"]
embedding_key = fields["audio_unique_name"]
save_file(speaker_mapping, mapping_file_path)
print("Speaker embeddings saved at:", mapping_file_path)
# Only update the speaker name when the embedding is already in the old file.
if embedding_key in speaker_mapping:
speaker_mapping[embedding_key]["name"] = class_name
continue
if old_speakers_file is not None and embedding_key in encoder_manager.clip_ids:
# get the embedding from the old file
embedd = encoder_manager.get_embedding_by_clip(embedding_key)
else:
# extract the embedding
embedd = encoder_manager.compute_embedding_from_clip(audio_file)
# create speaker_mapping if target dataset is defined
speaker_mapping[embedding_key] = {}
speaker_mapping[embedding_key]["name"] = class_name
speaker_mapping[embedding_key]["embedding"] = embedd
if speaker_mapping:
# save speaker_mapping if target dataset is defined
if os.path.isdir(output_path):
mapping_file_path = os.path.join(output_path, "speakers.pth")
else:
mapping_file_path = output_path
if os.path.dirname(mapping_file_path) != "":
os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
save_file(speaker_mapping, mapping_file_path)
print("Speaker embeddings saved at:", mapping_file_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
"""
Example runs:
python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
""",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument(
"--model_path",
type=str,
help="Path to model checkpoint file. It defaults to the released speaker encoder.",
default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
)
parser.add_argument(
"--config_path",
type=str,
help="Path to model config file. It defaults to the released speaker encoder config.",
default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
)
parser.add_argument(
"--config_dataset_path",
type=str,
help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
default=None,
)
parser.add_argument(
"--output_path",
type=str,
help="Path for output `pth` or `json` file.",
default="speakers.pth",
)
parser.add_argument(
"--old_file",
type=str,
help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.",
default=None,
)
parser.add_argument(
"--old_append",
help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False",
default=False,
action="store_true",
)
parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
parser.add_argument(
"--formatter_name",
type=str,
help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--dataset_name",
type=str,
help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--dataset_path",
type=str,
help="Path to the dataset. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--meta_file_train",
type=str,
help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--meta_file_val",
type=str,
help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
default=None,
)
args = parser.parse_args()
compute_embeddings(
args.model_path,
args.config_path,
args.output_path,
old_speakers_file=args.old_file,
old_append=args.old_append,
config_dataset_path=args.config_dataset_path,
formatter_name=args.formatter_name,
dataset_name=args.dataset_name,
dataset_path=args.dataset_path,
meta_file_train=args.meta_file_train,
meta_file_val=args.meta_file_val,
disable_cuda=args.disable_cuda,
no_eval=args.no_eval,
)
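Because the embedding logic now lives in a reusable `compute_embeddings()` function instead of module-level script code, it can also be called from Python; a sketch with placeholder paths (the CLI defaults point to the released speaker encoder checkpoint and config):
```
from TTS.bin.compute_embeddings import compute_embeddings

# Placeholder paths for the encoder checkpoint/config and the dataset config.
compute_embeddings(
    model_path="speaker_encoder_model.pth",
    config_path="speaker_encoder_config.json",
    output_path="speakers.pth",
    config_dataset_path="dataset_config.json",
)
```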


@ -10,7 +10,6 @@ from TTS.tts.utils.speakers import SpeakerManager
def compute_encoder_accuracy(dataset_items, encoder_manager):
class_name_key = encoder_manager.encoder_config.class_name_key
map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)


@ -15,6 +15,7 @@ from TTS.tts.models import setup_model
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.audio.numpy_transforms import quantize
from TTS.utils.generic_utils import count_parameters
use_cuda = torch.cuda.is_available()
@ -37,7 +38,7 @@ def setup_loader(ap, r, verbose=False):
precompute_num_workers=0,
use_noise_augment=False,
verbose=verbose,
speaker_id_mapping=speaker_manager.ids if c.use_speaker_embedding else None,
speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
)
@ -159,12 +160,11 @@ def inference(
def extract_spectrograms(
data_loader, model, ap, output_path, quantized_wav=False, save_audio=False, debug=False, metada_name="metada.txt"
data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
):
model.eval()
export_metadata = []
for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
# format data
(
text_input,
@ -197,8 +197,8 @@ def extract_spectrograms(
_, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)
# quantize and save wav
if quantized_wav:
wavq = ap.quantize(wav)
if quantize_bits > 0:
wavq = quantize(wav, quantize_bits)
np.save(wavq_path, wavq)
# save TTS mel
@ -264,7 +264,7 @@ def main(args): # pylint: disable=redefined-outer-name
model,
ap,
args.output_path,
quantized_wav=args.quantized,
quantize_bits=args.quantize_bits,
save_audio=args.save_audio,
debug=args.debug,
metada_name="metada.txt",
@ -278,7 +278,7 @@ if __name__ == "__main__":
parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
parser.add_argument("--quantized", action="store_true", help="Save quantized audio files")
parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
args = parser.parse_args()


@ -7,30 +7,25 @@ from tqdm.contrib.concurrent import process_map
from TTS.config import load_config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
phonemizer = Gruut(language="en-us")
from TTS.tts.utils.text.phonemizers import Gruut
def compute_phonemes(item):
try:
text = item[0]
ph = phonemizer.phonemize(text).split("|")
except:
return []
return list(set(ph))
text = item["text"]
ph = phonemizer.phonemize(text).replace("|", "")
return set(list(ph))
def main():
# pylint: disable=W0601
global c
global c, phonemizer
# pylint: disable=bad-option-value
parser = argparse.ArgumentParser(
description="""Find all the unique characters or phonemes in a dataset.\n\n"""
"""
Example runs:
python TTS/bin/find_unique_chars.py --config_path config.json
python TTS/bin/find_unique_phonemes.py --config_path config.json
""",
formatter_class=RawTextHelpFormatter,
)
@ -46,15 +41,24 @@ def main():
items = train_items + eval_items
print("Num items:", len(items))
is_lang_def = all(item["language"] for item in items)
language_list = [item["language"] for item in items]
is_lang_def = all(language_list)
if not c.phoneme_language or not is_lang_def:
raise ValueError("Phoneme language must be defined in config.")
if not language_list.count(language_list[0]) == len(language_list):
raise ValueError(
"Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
)
phonemizer = Gruut(language=language_list[0], keep_puncs=True)
phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
phones = []
for ph in phonemes:
phones.extend(ph)
phones = set(phones)
lower_phones = filter(lambda c: c.islower(), phones)
phones_force_lower = [c.lower() for c in phones]


@ -1,50 +1,75 @@
import argparse
import glob
import multiprocessing
import os
import pathlib
import torch
from tqdm import tqdm
from TTS.utils.vad import get_vad_model_and_utils, remove_silence
torch.set_num_threads(1)
def adjust_path_and_remove_silence(audio_path):
output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
# ignore if the file exists
if os.path.exists(output_path) and not args.force:
return output_path
return output_path, False
# create all directory structure
pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
# remove the silence and save the audio
output_path = remove_silence(
output_path, is_speech = remove_silence(
model_and_utils,
audio_path,
output_path,
trim_just_beginning_and_end=args.trim_just_beginning_and_end,
use_cuda=args.use_cuda,
)
return output_path
return output_path, is_speech
def preprocess_audios():
files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
print("> Number of files: ", len(files))
if not args.force:
print("> Ignoring files that already exist in the output directory.")
print("> Ignoring files that already exist in the output idrectory.")
if args.trim_just_beginning_and_end:
print("> Trimming just the beginning and the end with nonspeech parts.")
else:
print("> Trimming all nonspeech parts.")
filtered_files = []
if files:
# create threads
# num_threads = multiprocessing.cpu_count()
# process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
for f in tqdm(files):
adjust_path_and_remove_silence(f)
if args.num_processes > 1:
with multiprocessing.Pool(processes=args.num_processes) as pool:
results = list(
tqdm(
pool.imap_unordered(adjust_path_and_remove_silence, files),
total=len(files),
desc="Processing audio files",
)
)
for output_path, is_speech in results:
if not is_speech:
filtered_files.append(output_path)
else:
for f in tqdm(files):
output_path, is_speech = adjust_path_and_remove_silence(f)
if not is_speech:
filtered_files.append(output_path)
# write files that do not have speech
with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
for file in filtered_files:
f.write(str(file) + "\n")
else:
print("> No files Found !")
@ -53,10 +78,8 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
)
parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
parser.add_argument(
"-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
)
parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
parser.add_argument(
"-g",
@ -79,7 +102,23 @@ if __name__ == "__main__":
default=False,
help="If True use cuda",
)
parser.add_argument(
"--use_onnx",
type=bool,
default=False,
help="If True use onnx",
)
parser.add_argument(
"--num_processes",
type=int,
default=1,
help="Number of processes to use",
)
args = parser.parse_args()
if args.output_dir == "":
args.output_dir = args.input_dir
# load the model and utils
model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
preprocess_audios()


@ -2,8 +2,8 @@ import argparse
import glob
import os
from argparse import RawTextHelpFormatter
from distutils.dir_util import copy_tree
from multiprocessing import Pool
from shutil import copytree
import librosa
import soundfile as sf
@ -16,8 +16,25 @@ def resample_file(func_args):
sf.write(filename, y, sr)
if __name__ == "__main__":
def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
if output_dir:
print("Recursively copying the input folder...")
copytree(input_dir, output_dir)
input_dir = output_dir
print("Resampling the audio files...")
audio_files = glob.glob(os.path.join(input_dir, f"**/*.{file_ext}"), recursive=True)
print(f"Found {len(audio_files)} files...")
audio_files = list(zip(audio_files, len(audio_files) * [output_sr]))
with Pool(processes=n_jobs) as p:
with tqdm(total=len(audio_files)) as pbar:
for _, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
pbar.update()
print("Done !")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""Resample a folder recusively with librosa
Can be used in place or create a copy of the folder as an output.\n\n
@ -70,18 +87,4 @@ if __name__ == "__main__":
args = parser.parse_args()
if args.output_dir:
print("Recursively copying the input folder...")
copy_tree(args.input_dir, args.output_dir)
args.input_dir = args.output_dir
print("Resampling the audio files...")
audio_files = glob.glob(os.path.join(args.input_dir, f"**/*.{args.file_ext}"), recursive=True)
print(f"Found {len(audio_files)} files...")
audio_files = list(zip(audio_files, len(audio_files) * [args.output_sr]))
with Pool(processes=args.n_jobs) as p:
with tqdm(total=len(audio_files)) as pbar:
for i, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
pbar.update()
print("Done !")
resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
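Likewise, the resampling logic is now wrapped in a `resample_files()` function, so it can be reused from Python; a sketch with placeholder paths, assuming this script is `TTS/bin/resample.py`:
```
from TTS.bin.resample import resample_files

# Copy the corpus into output_dir, then resample every wav there to 22050 Hz (placeholder paths).
resample_files(
    input_dir="/data/my_corpus",
    output_sr=22050,
    output_dir="/data/my_corpus_22k",
    file_ext="wav",
    n_jobs=4,
)
```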


@ -2,14 +2,133 @@
# -*- coding: utf-8 -*-
import argparse
import contextlib
import sys
from argparse import RawTextHelpFormatter
# pylint: disable=redefined-outer-name, unused-argument
from pathlib import Path
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
description = """
Synthesize speech on command line.
You can either use your trained model or choose a model from the provided list.
If you don't specify any models, then it uses LJSpeech based English model.
#### Single Speaker Models
- List provided models:
```
$ tts --list_models
```
- Get model info (for both tts_models and vocoder_models):
- Query by type/name:
The model_info_by_name uses the name as it appears in the output of --list_models.
```
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
```
For example:
```
$ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
$ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
```
- Query by type/idx:
The model_query_idx uses the corresponding idx from --list_models.
```
$ tts --model_info_by_idx "<model_type>/<model_query_idx>"
```
For example:
```
$ tts --model_info_by_idx tts_models/3
```
- Query model info by full name:
```
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
```
- Run TTS with default models:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav
```
- Run TTS and pipe out the generated TTS wav file data:
```
$ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
```
- Run a TTS model with its default vocoder model:
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
```
For example:
```
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
```
- Run with specific TTS and vocoder models from the list:
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
```
For example:
```
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
```
- Run your own TTS model (Using Griffin-Lim Vocoder):
```
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```
- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```
#### Multi-speaker Models
- List the available speakers and choose a <speaker_id> among them:
```
$ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
```
- Run the multi-speaker TTS model with the target speaker ID:
```
$ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
```
- Run your own multi-speaker TTS model:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
### Voice Conversion Models
```
$ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
```
"""
def str2bool(v):
@ -23,86 +142,6 @@ def str2bool(v):
def main():
description = """Synthesize speech on command line.
You can either use your trained model or choose a model from the provided list.
If you don't specify any models, then it uses LJSpeech based English model.
## Example Runs
### Single Speaker Models
- List provided models:
```
$ tts --list_models
```
- Query info for model info by idx:
```
$ tts --model_info_by_idx "<model_type>/<model_query_idx>"
```
- Query info for model info by full name:
```
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
```
- Run TTS with default models:
```
$ tts --text "Text for TTS"
```
- Run a TTS model with its default vocoder model:
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>
```
- Run with specific TTS and vocoder models from the list:
```
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --output_path
```
- Run your own TTS model (Using Griffin-Lim Vocoder):
```
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```
- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```
### Multi-speaker Models
- List the available speakers and choose as <speaker_id> among them:
```
$ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
```
- Run the multi-speaker TTS model with the target speaker ID:
```
$ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
```
- Run your own multi-speaker TTS model:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
"""
# We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
# documentation in sync more easily.
parser = argparse.ArgumentParser(
description=description.replace(" ```\n", ""),
formatter_class=RawTextHelpFormatter,
@ -162,6 +201,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
help="Output wav file path.",
)
parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
parser.add_argument(
"--vocoder_path",
type=str,
@ -176,7 +216,15 @@ If you don't specify any models, then it uses LJSpeech based English model.
default=None,
)
parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
parser.add_argument(
"--pipe_out",
help="stdout the generated TTS wav file for shell pipe.",
type=str2bool,
nargs="?",
const=True,
default=False,
)
# args for multi-speaker synthesis
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
@ -238,6 +286,34 @@ If you don't specify any models, then it uses LJSpeech based English model.
help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
default=None,
)
parser.add_argument(
"--progress_bar",
type=str2bool,
help="If true shows a progress bar for the model download. Defaults to True",
default=True,
)
# voice conversion args
parser.add_argument(
"--source_wav",
type=str,
default=None,
help="Original audio file to convert in the voice of the target_wav",
)
parser.add_argument(
"--target_wav",
type=str,
default=None,
help="Target audio file to convert in the voice of the source_wav",
)
parser.add_argument(
"--voice_dir",
type=str,
default=None,
help="Voice dir for tortoise model",
)
args = parser.parse_args()
# print the description if either text or list_models is not set
@ -249,118 +325,169 @@ If you don't specify any models, then it uses LJSpeech based English model.
args.reference_wav,
args.model_info_by_idx,
args.model_info_by_name,
args.source_wav,
args.target_wav,
]
if not any(check_args):
parser.parse_args(["-h"])
# load model manager
path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path)
pipe_out = sys.stdout if args.pipe_out else None
model_path = None
config_path = None
speakers_file_path = None
language_ids_file_path = None
vocoder_path = None
vocoder_config_path = None
encoder_path = None
encoder_config_path = None
with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
# Late-import to make things load faster
from TTS.api import TTS
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
# CASE1 #list : list pre-trained TTS models
if args.list_models:
manager.list_models()
sys.exit()
# load model manager
path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path, progress_bar=args.progress_bar)
api = TTS()
# CASE2 #info : model info of pre-trained TTS models
if args.model_info_by_idx:
model_query = args.model_info_by_idx
manager.model_info_by_idx(model_query)
sys.exit()
tts_path = None
tts_config_path = None
speakers_file_path = None
language_ids_file_path = None
vocoder_path = None
vocoder_config_path = None
encoder_path = None
encoder_config_path = None
vc_path = None
vc_config_path = None
model_dir = None
if args.model_info_by_name:
model_query_full_name = args.model_info_by_name
manager.model_info_by_full_name(model_query_full_name)
sys.exit()
# CASE1 #list : list pre-trained TTS models
if args.list_models:
manager.list_models()
sys.exit()
# CASE3: load pre-trained model paths
if args.model_name is not None and not args.model_path:
model_path, config_path, model_item = manager.download_model(args.model_name)
args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
# CASE2 #info : model info for pre-trained TTS models
if args.model_info_by_idx:
model_query = args.model_info_by_idx
manager.model_info_by_idx(model_query)
sys.exit()
if args.vocoder_name is not None and not args.vocoder_path:
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
if args.model_info_by_name:
model_query_full_name = args.model_info_by_name
manager.model_info_by_full_name(model_query_full_name)
sys.exit()
# CASE4: set custom model paths
if args.model_path is not None:
model_path = args.model_path
config_path = args.config_path
speakers_file_path = args.speakers_file_path
language_ids_file_path = args.language_ids_file_path
# CASE3: load pre-trained model paths
if args.model_name is not None and not args.model_path:
model_path, config_path, model_item = manager.download_model(args.model_name)
# tts model
if model_item["model_type"] == "tts_models":
tts_path = model_path
tts_config_path = config_path
if "default_vocoder" in model_item:
args.vocoder_name = (
model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
)
if args.vocoder_path is not None:
vocoder_path = args.vocoder_path
vocoder_config_path = args.vocoder_config_path
# voice conversion model
if model_item["model_type"] == "voice_conversion_models":
vc_path = model_path
vc_config_path = config_path
if args.encoder_path is not None:
encoder_path = args.encoder_path
encoder_config_path = args.encoder_config_path
# tts model with multiple files to be loaded from the directory path
if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
model_dir = model_path
tts_path = None
tts_config_path = None
args.vocoder_name = None
# load models
synthesizer = Synthesizer(
model_path,
config_path,
speakers_file_path,
language_ids_file_path,
vocoder_path,
vocoder_config_path,
encoder_path,
encoder_config_path,
args.use_cuda,
)
# load vocoder
if args.vocoder_name is not None and not args.vocoder_path:
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
# query speaker ids of a multi-speaker model.
if args.list_speaker_idxs:
print(
" > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
)
print(synthesizer.tts_model.speaker_manager.ids)
return
# CASE4: set custom model paths
if args.model_path is not None:
tts_path = args.model_path
tts_config_path = args.config_path
speakers_file_path = args.speakers_file_path
language_ids_file_path = args.language_ids_file_path
# query language ids of a multi-lingual model.
if args.list_language_idxs:
print(
" > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
)
print(synthesizer.tts_model.language_manager.ids)
return
if args.vocoder_path is not None:
vocoder_path = args.vocoder_path
vocoder_config_path = args.vocoder_config_path
# check the arguments against a multi-speaker model.
if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
print(
" [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
"select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
)
return
if args.encoder_path is not None:
encoder_path = args.encoder_path
encoder_config_path = args.encoder_config_path
# RUN THE SYNTHESIS
if args.text:
print(" > Text: {}".format(args.text))
device = args.device
if args.use_cuda:
device = "cuda"
# kick it
wav = synthesizer.tts(
args.text,
args.speaker_idx,
args.language_idx,
args.speaker_wav,
reference_wav=args.reference_wav,
style_wav=args.capacitron_style_wav,
style_text=args.capacitron_style_text,
reference_speaker_name=args.reference_speaker_idx,
)
# load models
synthesizer = Synthesizer(
tts_path,
tts_config_path,
speakers_file_path,
language_ids_file_path,
vocoder_path,
vocoder_config_path,
encoder_path,
encoder_config_path,
vc_path,
vc_config_path,
model_dir,
args.voice_dir,
).to(device)
# save the results
print(" > Saving output to {}".format(args.out_path))
synthesizer.save_wav(wav, args.out_path)
# query speaker ids of a multi-speaker model.
if args.list_speaker_idxs:
print(
" > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
)
print(synthesizer.tts_model.speaker_manager.name_to_id)
return
# query language ids of a multi-lingual model.
if args.list_language_idxs:
print(
" > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
)
print(synthesizer.tts_model.language_manager.name_to_id)
return
# check the arguments against a multi-speaker model.
if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
print(
" [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
"select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
)
return
# RUN THE SYNTHESIS
if args.text:
print(" > Text: {}".format(args.text))
# kick it
if tts_path is not None:
wav = synthesizer.tts(
args.text,
speaker_name=args.speaker_idx,
language_name=args.language_idx,
speaker_wav=args.speaker_wav,
reference_wav=args.reference_wav,
style_wav=args.capacitron_style_wav,
style_text=args.capacitron_style_text,
reference_speaker_name=args.reference_speaker_idx,
)
elif vc_path is not None:
wav = synthesizer.voice_conversion(
source_wav=args.source_wav,
target_wav=args.target_wav,
)
elif model_dir is not None:
wav = synthesizer.tts(
args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
)
# save the results
print(" > Saving output to {}".format(args.out_path))
synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
if __name__ == "__main__":

View File

@ -8,17 +8,17 @@ import traceback
import torch
from torch.utils.data import DataLoader
from trainer.io import copy_model_files, save_best_model, save_checkpoint
from trainer.torch import NoamLR
from trainer.trainer_utils import get_optimizer
from TTS.encoder.dataset import EncoderDataset
from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
from TTS.encoder.utils.generic_utils import setup_encoder_model
from TTS.encoder.utils.training import init_training
from TTS.encoder.utils.visual import plot_embeddings
from TTS.tts.datasets import load_tts_samples
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
from TTS.utils.io import copy_model_files
from TTS.utils.samplers import PerfectBatchSampler
from TTS.utils.training import check_update
@ -125,7 +125,7 @@ def evaluation(model, criterion, data_loader, global_step):
def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
model.train()
best_loss = float("inf")
best_loss = {"train_loss": None, "eval_loss": float("inf")}
avg_loader_time = 0
end_time = time.time()
for epoch in range(c.epochs):
@ -222,7 +222,9 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
if global_step % c.save_step == 0:
# save model
save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch)
save_checkpoint(
c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict()
)
end_time = time.time()
@ -245,7 +247,18 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
flush=True,
)
# save the best checkpoint
best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
best_loss = save_best_model(
{"train_loss": None, "eval_loss": eval_loss},
best_loss,
c,
model,
optimizer,
None,
global_step,
epoch,
OUT_PATH,
criterion=criterion.state_dict(),
)
model.train()
return best_loss, global_step
@ -276,7 +289,7 @@ def main(args): # pylint: disable=redefined-outer-name
if c.loss == "softmaxproto" and c.model != "speaker_encoder":
c.map_classid_to_classname = map_classid_to_classname
copy_model_files(c, OUT_PATH)
copy_model_files(c, OUT_PATH, new_fields={})
if args.restore_path:
criterion, args.restore_step = model.load_checkpoint(

View File

@ -16,12 +16,9 @@ def read_json_with_comments(json_path):
# fallback to json
with fsspec.open(json_path, "r", encoding="utf-8") as f:
input_str = f.read()
# handle comments
input_str = re.sub(r"\\\n", "", input_str)
input_str = re.sub(r"//.*\n", "\n", input_str)
data = json.loads(input_str)
return data
# handle comments but not urls with //
input_str = re.sub(r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str)
return json.loads(input_str)
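To see why the single regex works: quoted strings are matched first, so a `//` inside a URL value is preserved, while real `//` comments outside strings are dropped. A small self-contained check (the pattern is transcribed here, so treat the exact escaping as an assumption):
```
import json
import re

COMMENT_PATTERN = r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\r\n])*?\*/)|(//.*)"

raw = """{
    // this line comment should be stripped
    "model_url": "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
    "default_vocoder": null  // trailing comment, also stripped
}"""

cleaned = re.sub(COMMENT_PATTERN, lambda m: m.group(1) or m.group(2) or "", raw)
data = json.loads(cleaned)
assert "//" in data["model_url"]  # the // inside the quoted URL survives
```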
def register_config(model_name: str) -> Coqpit:
"""Find the right config for the given model name.
@ -37,7 +34,13 @@ def register_config(model_name: str) -> Coqpit:
"""
config_class = None
config_name = model_name + "_config"
paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs"]
# TODO: fix this
if model_name == "xtts":
from TTS.tts.configs.xtts_config import XttsConfig
config_class = XttsConfig
paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"]
for path in paths:
try:
config_class = find_module(path, config_name)

View File

@ -62,7 +62,7 @@ class BaseAudioConfig(Coqpit):
Maximum frequency of the F0 frames. Defaults to ```640```.
pitch_fmin (float, optional):
Minimum frequency of the F0 frames. Defaults to ```0```.
Minimum frequency of the F0 frames. Defaults to ```1```.
trim_db (int):
Silence threshold used for silence trimming. Defaults to 45.
@ -144,7 +144,7 @@ class BaseAudioConfig(Coqpit):
do_amp_to_db_mel: bool = True
# f0 params
pitch_fmax: float = 640.0
pitch_fmin: float = 0.0
pitch_fmin: float = 1.0
# normalization params
signal_norm: bool = True
min_level_db: int = -100
@ -193,21 +193,27 @@ class BaseDatasetConfig(Coqpit):
"""Base config for TTS datasets.
Args:
name (str):
Dataset name that defines the preprocessor in use. Defaults to None.
formatter (str):
Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.
dataset_name (str):
Unique name for the dataset. Defaults to `""`.
path (str):
Root path to the dataset files. Defaults to None.
Root path to the dataset files. Defaults to `""`.
meta_file_train (str):
Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
Defaults to None.
Defaults to `""`.
ignored_speakers (List):
List of speaker IDs that are not used during training. Defaults to None.
language (str):
Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to None.
Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.
phonemizer (str):
Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`.
meta_file_val (str):
Name of the dataset meta file that defines the instances used at validation.
@ -217,11 +223,13 @@ class BaseDatasetConfig(Coqpit):
train the duration predictor.
"""
name: str = ""
formatter: str = ""
dataset_name: str = ""
path: str = ""
meta_file_train: str = ""
ignored_speakers: List[str] = None
language: str = ""
phonemizer: str = ""
meta_file_val: str = ""
meta_file_attn_mask: str = ""
@ -230,7 +238,7 @@ class BaseDatasetConfig(Coqpit):
):
"""Check config fields"""
c = asdict(self)
check_argument("name", c, restricted=True)
check_argument("formatter", c, restricted=True)
check_argument("path", c, restricted=True)
check_argument("meta_file_train", c, restricted=True)
check_argument("meta_file_val", c, restricted=False)

View File

@ -0,0 +1,2 @@
faster_whisper==0.9.0
gradio==4.7.1

View File

@ -0,0 +1,160 @@
import os
import gc
import torchaudio
import pandas
from faster_whisper import WhisperModel
from glob import glob
from tqdm import tqdm
import torch
import torchaudio
# torch.set_num_threads(1)
from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners
torch.set_num_threads(16)
import os
audio_types = (".wav", ".mp3", ".flac")
def list_audios(basePath, contains=None):
# return the set of files that are valid
return list_files(basePath, validExts=audio_types, contains=contains)
def list_files(basePath, validExts=None, contains=None):
# loop over the directory structure
for (rootDir, dirNames, filenames) in os.walk(basePath):
# loop over the filenames in the current directory
for filename in filenames:
# if the contains string is not none and the filename does not contain
# the supplied string, then ignore the file
if contains is not None and filename.find(contains) == -1:
continue
# determine the file extension of the current file
ext = filename[filename.rfind("."):].lower()
# check to see if the file is an audio and should be processed
if validExts is None or ext.endswith(validExts):
# construct the path to the audio and yield it
audioPath = os.path.join(rootDir, filename)
yield audioPath
def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):
audio_total_size = 0
# make sure that the output folder exists
os.makedirs(out_path, exist_ok=True)
# Loading Whisper
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Loading Whisper Model!")
asr_model = WhisperModel("large-v2", device=device, compute_type="float16")
metadata = {"audio_file": [], "text": [], "speaker_name": []}
if gradio_progress is not None:
tqdm_object = gradio_progress.tqdm(audio_files, desc="Formatting...")
else:
tqdm_object = tqdm(audio_files)
for audio_path in tqdm_object:
wav, sr = torchaudio.load(audio_path)
# stereo to mono if needed
if wav.size(0) != 1:
wav = torch.mean(wav, dim=0, keepdim=True)
wav = wav.squeeze()
audio_total_size += (wav.size(-1) / sr)
segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
segments = list(segments)
i = 0
sentence = ""
sentence_start = None
first_word = True
# add all segment words to a single list
words_list = []
for _, segment in enumerate(segments):
words = list(segment.words)
words_list.extend(words)
# process each word
for word_idx, word in enumerate(words_list):
if first_word:
sentence_start = word.start
# If it is the first sentence, add the buffer or use the beginning of the file
if word_idx == 0:
sentence_start = max(sentence_start - buffer, 0) # Add buffer to the sentence start
else:
# get previous sentence end
previous_word_end = words_list[word_idx - 1].end
# add the buffer or use the middle of the silence between the previous sentence and the current one
sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start)/2)
sentence = word.word
first_word = False
else:
sentence += word.word
if word.word[-1] in ["!", ".", "?"]:
sentence = sentence[1:]
# Expand number and abbreviations plus normalization
sentence = multilingual_cleaners(sentence, target_language)
audio_file_name, _ = os.path.splitext(os.path.basename(audio_path))
audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"
# Check for the next word's existence
if word_idx + 1 < len(words_list):
next_word_start = words_list[word_idx + 1].start
else:
# If there are no more words, this is the last sentence, so use the audio length as the next word start
next_word_start = (wav.shape[0] - 1) / sr
# Average the current word end and next word start
word_end = min((word.end + next_word_start) / 2, word.end + buffer)
absoulte_path = os.path.join(out_path, audio_file)
os.makedirs(os.path.dirname(absoulte_path), exist_ok=True)
i += 1
first_word = True
audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0)
# if the audio is too short (i.e. < 0.33 seconds), ignore it
if audio.size(-1) >= sr/3:
torchaudio.save(absoulte_path,
audio,
sr
)
else:
continue
metadata["audio_file"].append(audio_file)
metadata["text"].append(sentence)
metadata["speaker_name"].append(speaker_name)
df = pandas.DataFrame(metadata)
df = df.sample(frac=1)
num_val_samples = int(len(df)*eval_percentage)
df_eval = df[:num_val_samples]
df_train = df[num_val_samples:]
df_train = df_train.sort_values('audio_file')
train_metadata_path = os.path.join(out_path, "metadata_train.csv")
df_train.to_csv(train_metadata_path, sep="|", index=False)
eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
df_eval = df_eval.sort_values('audio_file')
df_eval.to_csv(eval_metadata_path, sep="|", index=False)
# deallocate VRAM and RAM
del asr_model, df_train, df_eval, df, metadata
gc.collect()
return train_metadata_path, eval_metadata_path, audio_total_size
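A hypothetical invocation of the helpers above, mirroring how the demo UI below wires them together (input folder, output path, and speaker name are placeholders):
```
# collect supported audio files and build train/eval metadata CSVs
audio_files = list(list_audios("/data/my_voice"))  # placeholder input folder
train_csv, eval_csv, total_audio_seconds = format_audio_list(
    audio_files,
    target_language="en",
    out_path="/tmp/xtts_ft/dataset",
    speaker_name="coqui",
)
print(train_csv, eval_csv, total_audio_seconds)
```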

View File

@ -0,0 +1,172 @@
import os
import gc
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
from TTS.utils.manage import ModelManager
def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path, max_audio_length=255995):
# Logging parameters
RUN_NAME = "GPT_XTTS_FT"
PROJECT_NAME = "XTTS_trainer"
DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None
# Set here the path that the checkpoints will be saved. Default: ./run/training/
OUT_PATH = os.path.join(output_path, "run", "training")
# Training Parameters
OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False
START_WITH_EVAL = False # if True it will start with evaluation
BATCH_SIZE = batch_size # set here the batch size
GRAD_ACUMM_STEPS = grad_acumm # set here the grad accumulation steps
# Define here the dataset that you want to use for the fine-tuning on.
config_dataset = BaseDatasetConfig(
formatter="coqui",
dataset_name="ft_dataset",
path=os.path.dirname(train_csv),
meta_file_train=train_csv,
meta_file_val=eval_csv,
language=language,
)
# Add here the configs of the datasets
DATASETS_CONFIG_LIST = [config_dataset]
# Define the path where XTTS v2.0.1 files will be downloaded
CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
# DVAE files
DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
# Set the path to the downloaded files
DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
# download DVAE files if needed
if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
print(" > Downloading DVAE files!")
ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
# Download XTTS v2.0 checkpoint if needed
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
XTTS_CONFIG_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json"
# XTTS transfer learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK)) # vocab.json file
XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK)) # model.pth file
XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK)) # config.json file
# download XTTS v2.0 files if needed
if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
print(" > Downloading XTTS v2.0 files!")
ModelManager._download_model_files(
[TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK, XTTS_CONFIG_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
)
# init args and config
model_args = GPTArgs(
max_conditioning_length=132300, # 6 secs
min_conditioning_length=66150, # 3 secs
debug_loading_failures=False,
max_wav_length=max_audio_length, # ~11.6 seconds
max_text_length=200,
mel_norm_file=MEL_NORM_FILE,
dvae_checkpoint=DVAE_CHECKPOINT,
xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune
tokenizer_file=TOKENIZER_FILE,
gpt_num_audio_tokens=1026,
gpt_start_audio_token=1024,
gpt_stop_audio_token=1025,
gpt_use_masking_gt_prompt_approach=True,
gpt_use_perceiver_resampler=True,
)
# define audio config
audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
# training parameters config
config = GPTTrainerConfig(
epochs=num_epochs,
output_path=OUT_PATH,
model_args=model_args,
run_name=RUN_NAME,
project_name=PROJECT_NAME,
run_description="""
GPT XTTS training
""",
dashboard_logger=DASHBOARD_LOGGER,
logger_uri=LOGGER_URI,
audio=audio_config,
batch_size=BATCH_SIZE,
batch_group_size=48,
eval_batch_size=BATCH_SIZE,
num_loader_workers=8,
eval_split_max_size=256,
print_step=50,
plot_step=100,
log_model_step=100,
save_step=1000,
save_n_checkpoints=1,
save_checkpoints=True,
# target_loss="loss",
print_eval=False,
# Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
optimizer="AdamW",
optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
lr=5e-06, # learning rate
lr_scheduler="MultiStepLR",
# it was adjusted accordingly for the new step scheme
lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
test_sentences=[],
)
# init the model from config
model = GPTTrainer.init_from_config(config)
# load training samples
train_samples, eval_samples = load_tts_samples(
DATASETS_CONFIG_LIST,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(
restore_path=None,  # the xtts checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it with the Trainer restore_path parameter
skip_train_epoch=False,
start_with_eval=START_WITH_EVAL,
grad_accum_steps=GRAD_ACUMM_STEPS,
),
config,
output_path=OUT_PATH,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
)
trainer.fit()
# get the longest text audio file to use as speaker reference
samples_len = [len(item["text"].split(" ")) for item in train_samples]
longest_text_idx = samples_len.index(max(samples_len))
speaker_ref = train_samples[longest_text_idx]["audio_file"]
trainer_out_path = trainer.output_path
# deallocate VRAM and RAM
del model, trainer, train_samples, eval_samples
gc.collect()
return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref
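For orientation, a hypothetical call to `train_gpt` using the demo's default hyperparameters (the CSV paths match the files written by `format_audio_list` above; all values are placeholders):
```
config_path, xtts_checkpoint, vocab_file, exp_path, speaker_ref = train_gpt(
    language="en",
    num_epochs=10,
    batch_size=4,
    grad_acumm=1,
    train_csv="/tmp/xtts_ft/dataset/metadata_train.csv",
    eval_csv="/tmp/xtts_ft/dataset/metadata_eval.csv",
    output_path="/tmp/xtts_ft",
    max_audio_length=int(11 * 22050),  # 11 seconds expressed in waveform frames
)
```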

View File

@ -0,0 +1,415 @@
import argparse
import os
import sys
import tempfile
import gradio as gr
import librosa.display
import numpy as np
import os
import torch
import torchaudio
import traceback
from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
def clear_gpu_cache():
# clear the GPU cache
if torch.cuda.is_available():
torch.cuda.empty_cache()
XTTS_MODEL = None
def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
global XTTS_MODEL
clear_gpu_cache()
if not xtts_checkpoint or not xtts_config or not xtts_vocab:
return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"
config = XttsConfig()
config.load_json(xtts_config)
XTTS_MODEL = Xtts.init_from_config(config)
print("Loading XTTS model! ")
XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
if torch.cuda.is_available():
XTTS_MODEL.cuda()
print("Model Loaded!")
return "Model Loaded!"
def run_tts(lang, tts_text, speaker_audio_file):
if XTTS_MODEL is None or not speaker_audio_file:
return "You need to run the previous step to load the model !!", None, None
gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(audio_path=speaker_audio_file, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_length=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs)
out = XTTS_MODEL.inference(
text=tts_text,
language=lang,
gpt_cond_latent=gpt_cond_latent,
speaker_embedding=speaker_embedding,
temperature=XTTS_MODEL.config.temperature, # Add custom parameters here
length_penalty=XTTS_MODEL.config.length_penalty,
repetition_penalty=XTTS_MODEL.config.repetition_penalty,
top_k=XTTS_MODEL.config.top_k,
top_p=XTTS_MODEL.config.top_p,
)
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
out["wav"] = torch.tensor(out["wav"]).unsqueeze(0)
out_path = fp.name
torchaudio.save(out_path, out["wav"], 24000)
return "Speech generated !", out_path, speaker_audio_file
# define a logger to redirect
class Logger:
def __init__(self, filename="log.out"):
self.log_file = filename
self.terminal = sys.stdout
self.log = open(self.log_file, "w")
def write(self, message):
self.terminal.write(message)
self.log.write(message)
def flush(self):
self.terminal.flush()
self.log.flush()
def isatty(self):
return False
# redirect stdout and stderr to a file
sys.stdout = Logger()
sys.stderr = sys.stdout
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
import logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.StreamHandler(sys.stdout)
]
)
def read_logs():
sys.stdout.flush()
with open(sys.stdout.log_file, "r") as f:
return f.read()
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""XTTS fine-tuning demo\n\n"""
"""
Example runs:
python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port
""",
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
"--port",
type=int,
help="Port to run the gradio demo. Default: 5003",
default=5003,
)
parser.add_argument(
"--out_path",
type=str,
help="Output path (where data and checkpoints will be saved) Default: /tmp/xtts_ft/",
default="/tmp/xtts_ft/",
)
parser.add_argument(
"--num_epochs",
type=int,
help="Number of epochs to train. Default: 10",
default=10,
)
parser.add_argument(
"--batch_size",
type=int,
help="Batch size. Default: 4",
default=4,
)
parser.add_argument(
"--grad_acumm",
type=int,
help="Grad accumulation steps. Default: 1",
default=1,
)
parser.add_argument(
"--max_audio_length",
type=int,
help="Max permitted audio size in seconds. Default: 11",
default=11,
)
args = parser.parse_args()
with gr.Blocks() as demo:
with gr.Tab("1 - Data processing"):
out_path = gr.Textbox(
label="Output path (where data and checkpoints will be saved):",
value=args.out_path,
)
# upload_file = gr.Audio(
# sources="upload",
# label="Select here the audio files that you want to use for XTTS trainining !",
# type="filepath",
# )
upload_file = gr.File(
file_count="multiple",
label="Select here the audio files that you want to use for XTTS trainining (Supported formats: wav, mp3, and flac)",
)
lang = gr.Dropdown(
label="Dataset Language",
value="en",
choices=[
"en",
"es",
"fr",
"de",
"it",
"pt",
"pl",
"tr",
"ru",
"nl",
"cs",
"ar",
"zh",
"hu",
"ko",
"ja"
],
)
progress_data = gr.Label(
label="Progress:"
)
logs = gr.Textbox(
label="Logs:",
interactive=False,
)
demo.load(read_logs, None, logs, every=1)
prompt_compute_btn = gr.Button(value="Step 1 - Create dataset")
def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
clear_gpu_cache()
out_path = os.path.join(out_path, "dataset")
os.makedirs(out_path, exist_ok=True)
if audio_path is None:
return "You should provide one or multiple audio files! If you provided it, probably the upload of the files is not finished yet!", "", ""
else:
try:
train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
except:
traceback.print_exc()
error = traceback.format_exc()
return f"The data processing was interrupted due an error !! Please check the console to verify the full error message! \n Error summary: {error}", "", ""
clear_gpu_cache()
# if audio total len is less than 2 minutes raise an error
if audio_total_size < 120:
message = "The sum of the duration of the audios that you provided should be at least 2 minutes!"
print(message)
return message, "", ""
print("Dataset Processed!")
return "Dataset Processed!", train_meta, eval_meta
with gr.Tab("2 - Fine-tuning XTTS Encoder"):
train_csv = gr.Textbox(
label="Train CSV:",
)
eval_csv = gr.Textbox(
label="Eval CSV:",
)
num_epochs = gr.Slider(
label="Number of epochs:",
minimum=1,
maximum=100,
step=1,
value=args.num_epochs,
)
batch_size = gr.Slider(
label="Batch size:",
minimum=2,
maximum=512,
step=1,
value=args.batch_size,
)
grad_acumm = gr.Slider(
label="Grad accumulation steps:",
minimum=2,
maximum=128,
step=1,
value=args.grad_acumm,
)
max_audio_length = gr.Slider(
label="Max permitted audio size in seconds:",
minimum=2,
maximum=20,
step=1,
value=args.max_audio_length,
)
progress_train = gr.Label(
label="Progress:"
)
logs_tts_train = gr.Textbox(
label="Logs:",
interactive=False,
)
demo.load(read_logs, None, logs_tts_train, every=1)
train_btn = gr.Button(value="Step 2 - Run the training")
def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length):
clear_gpu_cache()
if not train_csv or not eval_csv:
return "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", "", "", "", ""
try:
# convert seconds to waveform frames
max_audio_length = int(max_audio_length * 22050)
config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path, max_audio_length=max_audio_length)
except:
traceback.print_exc()
error = traceback.format_exc()
return f"The training was interrupted due an error !! Please check the console to check the full error message! \n Error summary: {error}", "", "", "", ""
# copy original files to avoid parameters changes issues
os.system(f"cp {config_path} {exp_path}")
os.system(f"cp {vocab_file} {exp_path}")
ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
print("Model training done!")
clear_gpu_cache()
return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
with gr.Tab("3 - Inference"):
with gr.Row():
with gr.Column() as col1:
xtts_checkpoint = gr.Textbox(
label="XTTS checkpoint path:",
value="",
)
xtts_config = gr.Textbox(
label="XTTS config path:",
value="",
)
xtts_vocab = gr.Textbox(
label="XTTS vocab path:",
value="",
)
progress_load = gr.Label(
label="Progress:"
)
load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model")
with gr.Column() as col2:
speaker_reference_audio = gr.Textbox(
label="Speaker reference audio:",
value="",
)
tts_language = gr.Dropdown(
label="Language",
value="en",
choices=[
"en",
"es",
"fr",
"de",
"it",
"pt",
"pl",
"tr",
"ru",
"nl",
"cs",
"ar",
"zh",
"hu",
"ko",
"ja",
]
)
tts_text = gr.Textbox(
label="Input Text.",
value="This model sounds really good and above all, it's reasonably fast.",
)
tts_btn = gr.Button(value="Step 4 - Inference")
with gr.Column() as col3:
progress_gen = gr.Label(
label="Progress:"
)
tts_output_audio = gr.Audio(label="Generated Audio.")
reference_audio = gr.Audio(label="Reference audio used.")
prompt_compute_btn.click(
fn=preprocess_dataset,
inputs=[
upload_file,
lang,
out_path,
],
outputs=[
progress_data,
train_csv,
eval_csv,
],
)
train_btn.click(
fn=train_model,
inputs=[
lang,
train_csv,
eval_csv,
num_epochs,
batch_size,
grad_acumm,
out_path,
max_audio_length,
],
outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
)
load_btn.click(
fn=load_model,
inputs=[
xtts_checkpoint,
xtts_config,
xtts_vocab
],
outputs=[progress_load],
)
tts_btn.click(
fn=run_tts,
inputs=[
tts_language,
tts_text,
speaker_reference_audio,
],
outputs=[progress_gen, tts_output_audio, reference_audio],
)
demo.launch(
share=True,
debug=False,
server_port=args.port,
server_name="0.0.0.0"
)
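The same load-and-infer flow as `load_model`/`run_tts` above can also be scripted without the Gradio UI; a minimal headless sketch (all paths are placeholders pointing at a fine-tuning output folder):
```
import torch
import torchaudio

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# placeholder paths: config/vocab copied into the experiment folder plus the best checkpoint
CONFIG_PATH = "/tmp/xtts_ft/run/training/GPT_XTTS_FT/config.json"
VOCAB_PATH = "/tmp/xtts_ft/run/training/GPT_XTTS_FT/vocab.json"
CHECKPOINT_PATH = "/tmp/xtts_ft/run/training/GPT_XTTS_FT/best_model.pth"
SPEAKER_WAV = "/tmp/xtts_ft/dataset/wavs/reference_00000000.wav"

config = XttsConfig()
config.load_json(CONFIG_PATH)
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_path=CHECKPOINT_PATH, vocab_path=VOCAB_PATH, use_deepspeed=False)
if torch.cuda.is_available():
    model.cuda()

gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
    audio_path=SPEAKER_WAV,
    gpt_cond_len=config.gpt_cond_len,
    max_ref_length=config.max_ref_len,
    sound_norm_refs=config.sound_norm_refs,
)
out = model.inference(
    text="This model sounds really good and above all, it's reasonably fast.",
    language="en",
    gpt_cond_latent=gpt_cond_latent,
    speaker_embedding=speaker_embedding,
    temperature=config.temperature,
)
torchaudio.save("xtts_ft_output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
```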

View File

@ -107,11 +107,18 @@ class BaseEncoder(nn.Module):
return criterion
def load_checkpoint(
self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None
self,
config: Coqpit,
checkpoint_path: str,
eval: bool = False,
use_cuda: bool = False,
criterion=None,
cache=False,
):
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
try:
self.load_state_dict(state["model"])
print(" > Model fully restored. ")
except (KeyError, RuntimeError) as error:
# If eval raise the error
if eval:

View File

@ -161,16 +161,14 @@ class ResNetSpeakerEncoder(BaseEncoder):
Shapes:
- x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
"""
with torch.no_grad():
with torch.cuda.amp.autocast(enabled=False):
x.squeeze_(1)
# if you torch spec compute it otherwise use the mel spec computed by the AP
if self.use_torch_spec:
x = self.torch_spec(x)
x.squeeze_(1)
# if using torch_spec, compute the spectrogram here; otherwise use the mel spec computed by the AP
if self.use_torch_spec:
x = self.torch_spec(x)
if self.log_input:
x = (x + 1e-6).log()
x = self.instancenorm(x).unsqueeze(1)
if self.log_input:
x = (x + 1e-6).log()
x = self.instancenorm(x).unsqueeze(1)
x = self.conv1(x)
x = self.relu(x)

View File

@ -1,20 +1,16 @@
import datetime
import glob
import os
import random
import re
import numpy as np
from scipy import signal
from TTS.encoder.models.lstm import LSTMSpeakerEncoder
from TTS.encoder.models.resnet import ResNetSpeakerEncoder
from TTS.utils.io import save_fsspec
class AugmentWAV(object):
def __init__(self, ap, augmentation_config):
self.ap = ap
self.use_additive_noise = False
@ -67,7 +63,6 @@ class AugmentWAV(object):
self.global_noise_list.append("RIR_AUG")
def additive_noise(self, noise_type, audio):
clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)
noise_list = random.sample(
@ -120,11 +115,6 @@ class AugmentWAV(object):
return self.additive_noise(noise_type, audio)
def to_camel(text):
text = text.capitalize()
return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
def setup_encoder_model(config: "Coqpit"):
if config.model_params["model_name"].lower() == "lstm":
model = LSTMSpeakerEncoder(
@ -144,41 +134,3 @@ def setup_encoder_model(config: "Coqpit"):
audio_config=config.audio,
)
return model
def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
checkpoint_path = "checkpoint_{}.pth".format(current_step)
checkpoint_path = os.path.join(out_path, checkpoint_path)
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
new_state_dict = model.state_dict()
state = {
"model": new_state_dict,
"optimizer": optimizer.state_dict() if optimizer is not None else None,
"criterion": criterion.state_dict(),
"step": current_step,
"epoch": epoch,
"loss": model_loss,
"date": datetime.date.today().strftime("%B %d, %Y"),
}
save_fsspec(state, checkpoint_path)
def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch):
if model_loss < best_loss:
new_state_dict = model.state_dict()
state = {
"model": new_state_dict,
"optimizer": optimizer.state_dict(),
"criterion": criterion.state_dict(),
"step": current_step,
"epoch": epoch,
"loss": model_loss,
"date": datetime.date.today().strftime("%B %d, %Y"),
}
best_loss = model_loss
bestmodel_path = "best_model.pth"
bestmodel_path = os.path.join(out_path, bestmodel_path)
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
save_fsspec(state, bestmodel_path)
return best_loss

View File

@ -1,38 +0,0 @@
import datetime
import os
from TTS.utils.io import save_fsspec
def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
checkpoint_path = "checkpoint_{}.pth".format(current_step)
checkpoint_path = os.path.join(out_path, checkpoint_path)
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
new_state_dict = model.state_dict()
state = {
"model": new_state_dict,
"optimizer": optimizer.state_dict() if optimizer is not None else None,
"step": current_step,
"loss": model_loss,
"date": datetime.date.today().strftime("%B %d, %Y"),
}
save_fsspec(state, checkpoint_path)
def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step):
if model_loss < best_loss:
new_state_dict = model.state_dict()
state = {
"model": new_state_dict,
"optimizer": optimizer.state_dict(),
"step": current_step,
"loss": model_loss,
"date": datetime.date.today().strftime("%B %d, %Y"),
}
best_loss = model_loss
bestmodel_path = "best_model.pth"
bestmodel_path = os.path.join(out_path, bestmodel_path)
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
save_fsspec(state, bestmodel_path)
return best_loss

View File

@ -3,13 +3,13 @@ from dataclasses import dataclass, field
from coqpit import Coqpit
from trainer import TrainerArgs, get_last_checkpoint
from trainer.io import copy_model_files
from trainer.logging import logger_factory
from trainer.logging.console_logger import ConsoleLogger
from TTS.config import load_config, register_config
from TTS.tts.utils.text.characters import parse_symbols
from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch
from TTS.utils.io import copy_model_files
@dataclass

View File

@ -23,7 +23,7 @@ colormap = (
[0, 0, 0],
[183, 183, 183],
],
dtype=np.float,
dtype=float,
)
/ 255
)

View File

@ -44,13 +44,16 @@ class BaseTrainerModel(TrainerModel):
return outputs_dict
@abstractmethod
def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
def load_checkpoint(
self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
) -> None:
"""Load a model checkpoint gile and get ready for training or inference.
Args:
config (Coqpit): Model configuration.
checkpoint_path (str): Path to the model checkpoint file.
eval (bool, optional): If true, init model for inference else for training. Defaults to False.
strcit (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
"""
...

View File

@ -5,9 +5,11 @@ import json
import os
import sys
from pathlib import Path
from threading import Lock
from typing import Union
from urllib.parse import parse_qs
from flask import Flask, render_template, request, send_file
from flask import Flask, render_template, render_template_string, request, send_file
from TTS.config import load_config
from TTS.utils.manage import ModelManager
@ -114,8 +116,13 @@ synthesizer = Synthesizer(
use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and (
synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None
)
speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
use_multi_language = hasattr(synthesizer.tts_model, "num_languages") and (
synthesizer.tts_model.num_languages > 1 or synthesizer.tts_languages_file is not None
)
language_manager = getattr(synthesizer.tts_model, "language_manager", None)
# TODO: set this from SpeakerManager
use_gst = synthesizer.tts_config.get("use_gst", False)
app = Flask(__name__)
@ -146,18 +153,28 @@ def index():
"index.html",
show_details=args.show_details,
use_multi_speaker=use_multi_speaker,
speaker_ids=speaker_manager.ids if speaker_manager is not None else None,
use_multi_language=use_multi_language,
speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None,
language_ids=language_manager.name_to_id if language_manager is not None else None,
use_gst=use_gst,
)
@app.route("/details")
def details():
model_config = load_config(args.tts_config)
if args.vocoder_config is not None and os.path.isfile(args.vocoder_config):
vocoder_config = load_config(args.vocoder_config)
if args.config_path is not None and os.path.isfile(args.config_path):
model_config = load_config(args.config_path)
else:
vocoder_config = None
if args.model_name is not None:
model_config = load_config(config_path)
if args.vocoder_config_path is not None and os.path.isfile(args.vocoder_config_path):
vocoder_config = load_config(args.vocoder_config_path)
else:
if args.vocoder_name is not None:
vocoder_config = load_config(vocoder_config_path)
else:
vocoder_config = None
return render_template(
"details.html",
@ -168,17 +185,68 @@ def details():
)
@app.route("/api/tts", methods=["GET"])
lock = Lock()
@app.route("/api/tts", methods=["GET", "POST"])
def tts():
text = request.args.get("text")
speaker_idx = request.args.get("speaker_id", "")
style_wav = request.args.get("style_wav", "")
style_wav = style_wav_uri_to_dict(style_wav)
print(" > Model input: {}".format(text))
print(" > Speaker Idx: {}".format(speaker_idx))
wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav)
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
with lock:
text = request.headers.get("text") or request.values.get("text", "")
speaker_idx = request.headers.get("speaker-id") or request.values.get("speaker_id", "")
language_idx = request.headers.get("language-id") or request.values.get("language_id", "")
style_wav = request.headers.get("style-wav") or request.values.get("style_wav", "")
style_wav = style_wav_uri_to_dict(style_wav)
print(f" > Model input: {text}")
print(f" > Speaker Idx: {speaker_idx}")
print(f" > Language Idx: {language_idx}")
wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav)
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
return send_file(out, mimetype="audio/wav")
# Basic MaryTTS compatibility layer
@app.route("/locales", methods=["GET"])
def mary_tts_api_locales():
"""MaryTTS-compatible /locales endpoint"""
# NOTE: We currently assume there is only one model active at the same time
if args.model_name is not None:
model_details = args.model_name.split("/")
else:
model_details = ["", "en", "", "default"]
return render_template_string("{{ locale }}\n", locale=model_details[1])
@app.route("/voices", methods=["GET"])
def mary_tts_api_voices():
"""MaryTTS-compatible /voices endpoint"""
# NOTE: We currently assume there is only one model active at the same time
if args.model_name is not None:
model_details = args.model_name.split("/")
else:
model_details = ["", "en", "", "default"]
return render_template_string(
"{{ name }} {{ locale }} {{ gender }}\n", name=model_details[3], locale=model_details[1], gender="u"
)
@app.route("/process", methods=["GET", "POST"])
def mary_tts_api_process():
"""MaryTTS-compatible /process endpoint"""
with lock:
if request.method == "POST":
data = parse_qs(request.get_data(as_text=True))
# NOTE: we ignore the LOCALE and VOICE params for now since we have only one active model
text = data.get("INPUT_TEXT", [""])[0]
else:
text = request.args.get("INPUT_TEXT", "")
print(f" > Model input: {text}")
wavs = synthesizer.tts(text)
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
return send_file(out, mimetype="audio/wav")
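A quick client-side check of the updated `/api/tts` endpoint (the port assumes the server's default; `speaker_id`/`language_id` only matter for multi-speaker or multi-lingual models):
```
import requests

# form fields are read via request.values; custom headers would work as well
resp = requests.post(
    "http://localhost:5002/api/tts",
    data={"text": "Hello from the demo server.", "speaker_id": "", "language_id": ""},
    timeout=60,
)
resp.raise_for_status()
with open("server_output.wav", "wb") as f:
    f.write(resp.content)
```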

View File

@ -65,7 +65,7 @@
</ul>
{%if use_gst%}
<input value='{"0": 0.1}' id="style_wav" placeholder="style wav (dict or path ot wav).." size=45
<input value='{"0": 0.1}' id="style_wav" placeholder="style wav (dict or path to wav).." size=45
type="text" name="style_wav">
{%endif%}
@ -81,6 +81,16 @@
</select><br /><br />
{%endif%}
{%if use_multi_language%}
Choose a language:
<select id="language_id" name=language_id method="GET" action="/">
{% for language_id in language_ids %}
<option value="{{language_id}}" SELECTED>{{language_id}}</option>"
{% endfor %}
</select><br /><br />
{%endif%}
{%if show_details%}
<button id="details-button" onclick="location.href = 'details'" name="model-details">Model
Details</button><br /><br />
@ -106,11 +116,12 @@
const text = q('#text').value
const speaker_id = getTextValue('#speaker_id')
const style_wav = getTextValue('#style_wav')
const language_id = getTextValue('#language_id')
if (text) {
q('#message').textContent = 'Synthesizing...'
q('#speak-button').disabled = true
q('#audio').hidden = true
synthesize(text, speaker_id, style_wav)
synthesize(text, speaker_id, style_wav, language_id)
}
e.preventDefault()
return false
@ -121,8 +132,8 @@
do_tts(e)
}
})
function synthesize(text, speaker_id = "", style_wav = "") {
fetch(`/api/tts?text=${encodeURIComponent(text)}&speaker_id=${encodeURIComponent(speaker_id)}&style_wav=${encodeURIComponent(style_wav)}`, { cache: 'no-cache' })
function synthesize(text, speaker_id = "", style_wav = "", language_id = "") {
fetch(`/api/tts?text=${encodeURIComponent(text)}&speaker_id=${encodeURIComponent(speaker_id)}&style_wav=${encodeURIComponent(style_wav)}&language_id=${encodeURIComponent(language_id)}`, { cache: 'no-cache' })
.then(function (res) {
if (!res.ok) throw Error(res.statusText)
return res.blob()

View File

@ -0,0 +1,105 @@
import os
from dataclasses import dataclass, field
from typing import Dict
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.layers.bark.model import GPTConfig
from TTS.tts.layers.bark.model_fine import FineGPTConfig
from TTS.tts.models.bark import BarkAudioConfig
from TTS.utils.generic_utils import get_user_data_dir
@dataclass
class BarkConfig(BaseTTSConfig):
"""Bark TTS configuration
Args:
model (str): model name that registers the model.
audio (BarkAudioConfig): audio configuration. Defaults to BarkAudioConfig().
num_chars (int): number of characters in the alphabet. Defaults to 0.
semantic_config (GPTConfig): semantic configuration. Defaults to GPTConfig().
fine_config (FineGPTConfig): fine configuration. Defaults to FineGPTConfig().
coarse_config (GPTConfig): coarse configuration. Defaults to GPTConfig().
CONTEXT_WINDOW_SIZE (int): GPT context window size. Defaults to 1024.
SEMANTIC_RATE_HZ (float): semantic tokens rate in Hz. Defaults to 49.9.
SEMANTIC_VOCAB_SIZE (int): semantic vocabulary size. Defaults to 10_000.
CODEBOOK_SIZE (int): encodec codebook size. Defaults to 1024.
N_COARSE_CODEBOOKS (int): number of coarse codebooks. Defaults to 2.
N_FINE_CODEBOOKS (int): number of fine codebooks. Defaults to 8.
COARSE_RATE_HZ (int): coarse tokens rate in Hz. Defaults to 75.
SAMPLE_RATE (int): sample rate. Defaults to 24_000.
USE_SMALLER_MODELS (bool): use smaller models. Defaults to False.
TEXT_ENCODING_OFFSET (int): text encoding offset. Defaults to 10_048.
SEMANTIC_PAD_TOKEN (int): semantic pad token. Defaults to 10_000.
TEXT_PAD_TOKEN ([type]): text pad token. Defaults to 10_048.
TEXT_EOS_TOKEN ([type]): text end of sentence token. Defaults to 10_049.
TEXT_SOS_TOKEN ([type]): text start of sentence token. Defaults to 10_050.
SEMANTIC_INFER_TOKEN (int): semantic infer token. Defaults to 10_051.
COARSE_SEMANTIC_PAD_TOKEN (int): coarse semantic pad token. Defaults to 12_048.
COARSE_INFER_TOKEN (int): coarse infer token. Defaults to 12_050.
REMOTE_BASE_URL ([type]): remote base url. Defaults to "https://huggingface.co/erogol/bark/tree".
REMOTE_MODEL_PATHS (Dict): remote model paths. Defaults to None.
LOCAL_MODEL_PATHS (Dict): local model paths. Defaults to None.
SMALL_REMOTE_MODEL_PATHS (Dict): small remote model paths. Defaults to None.
CACHE_DIR (str): local cache directory. Defaults to get_user_data_dir().
DEF_SPEAKER_DIR (str): default speaker directory to store speaker values for voice cloning. Defaults to get_user_data_dir().
"""
model: str = "bark"
audio: BarkAudioConfig = field(default_factory=BarkAudioConfig)
num_chars: int = 0
semantic_config: GPTConfig = field(default_factory=GPTConfig)
fine_config: FineGPTConfig = field(default_factory=FineGPTConfig)
coarse_config: GPTConfig = field(default_factory=GPTConfig)
CONTEXT_WINDOW_SIZE: int = 1024
SEMANTIC_RATE_HZ: float = 49.9
SEMANTIC_VOCAB_SIZE: int = 10_000
CODEBOOK_SIZE: int = 1024
N_COARSE_CODEBOOKS: int = 2
N_FINE_CODEBOOKS: int = 8
COARSE_RATE_HZ: int = 75
SAMPLE_RATE: int = 24_000
USE_SMALLER_MODELS: bool = False
TEXT_ENCODING_OFFSET: int = 10_048
SEMANTIC_PAD_TOKEN: int = 10_000
TEXT_PAD_TOKEN: int = 129_595
SEMANTIC_INFER_TOKEN: int = 129_599
COARSE_SEMANTIC_PAD_TOKEN: int = 12_048
COARSE_INFER_TOKEN: int = 12_050
REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/"
REMOTE_MODEL_PATHS: Dict = None
LOCAL_MODEL_PATHS: Dict = None
SMALL_REMOTE_MODEL_PATHS: Dict = None
CACHE_DIR: str = str(get_user_data_dir("tts/suno/bark_v0"))
DEF_SPEAKER_DIR: str = str(get_user_data_dir("tts/bark_v0/speakers"))
def __post_init__(self):
self.REMOTE_MODEL_PATHS = {
"text": {
"path": os.path.join(self.REMOTE_BASE_URL, "text_2.pt"),
"checksum": "54afa89d65e318d4f5f80e8e8799026a",
},
"coarse": {
"path": os.path.join(self.REMOTE_BASE_URL, "coarse_2.pt"),
"checksum": "8a98094e5e3a255a5c9c0ab7efe8fd28",
},
"fine": {
"path": os.path.join(self.REMOTE_BASE_URL, "fine_2.pt"),
"checksum": "59d184ed44e3650774a2f0503a48a97b",
},
}
self.LOCAL_MODEL_PATHS = {
"text": os.path.join(self.CACHE_DIR, "text_2.pt"),
"coarse": os.path.join(self.CACHE_DIR, "coarse_2.pt"),
"fine": os.path.join(self.CACHE_DIR, "fine_2.pt"),
"hubert_tokenizer": os.path.join(self.CACHE_DIR, "tokenizer.pth"),
"hubert": os.path.join(self.CACHE_DIR, "hubert.pt"),
}
self.SMALL_REMOTE_MODEL_PATHS = {
"text": {"path": os.path.join(self.REMOTE_BASE_URL, "text.pt")},
"coarse": {"path": os.path.join(self.REMOTE_BASE_URL, "coarse.pt")},
"fine": {"path": os.path.join(self.REMOTE_BASE_URL, "fine.pt")},
}
self.sample_rate = self.SAMPLE_RATE # pylint: disable=attribute-defined-outside-init
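A small sketch showing that the remote/local path tables are only filled in `__post_init__`, so they are available right after construction (the module path in the import is an assumption):
```
from TTS.tts.configs.bark_config import BarkConfig  # assumed module path for this new config

config = BarkConfig()
# derived in __post_init__ from REMOTE_BASE_URL and CACHE_DIR
print(config.REMOTE_MODEL_PATHS["text"]["path"])  # .../bark/tree/main/text_2.pt
print(config.LOCAL_MODEL_PATHS["coarse"])         # <user data dir>/tts/suno/bark_v0/coarse_2.pt
```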

View File

@ -0,0 +1,170 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig
@dataclass
class DelightfulTTSConfig(BaseTTSConfig):
"""
Configuration class for the DelightfulTTS model.
Attributes:
model (str): Name of the model ("delightful_tts").
audio (DelightfulTtsAudioConfig): Configuration for audio settings.
model_args (DelightfulTtsArgs): Configuration for model arguments.
use_attn_priors (bool): Whether to use attention priors.
vocoder (VocoderConfig): Configuration for the vocoder.
init_discriminator (bool): Whether to initialize the discriminator.
steps_to_start_discriminator (int): Number of steps to start the discriminator.
grad_clip (List[float]): Gradient clipping values.
lr_gen (float): Learning rate for the gan generator.
lr_disc (float): Learning rate for the gan discriminator.
lr_scheduler_gen (str): Name of the learning rate scheduler for the generator.
lr_scheduler_gen_params (dict): Parameters for the learning rate scheduler for the generator.
lr_scheduler_disc (str): Name of the learning rate scheduler for the discriminator.
lr_scheduler_disc_params (dict): Parameters for the learning rate scheduler for the discriminator.
scheduler_after_epoch (bool): Whether to schedule after each epoch.
optimizer (str): Name of the optimizer.
optimizer_params (dict): Parameters for the optimizer.
ssim_loss_alpha (float): Alpha value for the SSIM loss.
mel_loss_alpha (float): Alpha value for the mel loss.
aligner_loss_alpha (float): Alpha value for the aligner loss.
pitch_loss_alpha (float): Alpha value for the pitch loss.
energy_loss_alpha (float): Alpha value for the energy loss.
u_prosody_loss_alpha (float): Alpha value for the utterance prosody loss.
p_prosody_loss_alpha (float): Alpha value for the phoneme prosody loss.
dur_loss_alpha (float): Alpha value for the duration loss.
char_dur_loss_alpha (float): Alpha value for the character duration loss.
binary_align_loss_alpha (float): Alpha value for the binary alignment loss.
binary_loss_warmup_epochs (int): Number of warm-up epochs for the binary loss.
disc_loss_alpha (float): Alpha value for the discriminator loss.
gen_loss_alpha (float): Alpha value for the generator loss.
feat_loss_alpha (float): Alpha value for the feature loss.
vocoder_mel_loss_alpha (float): Alpha value for the vocoder mel loss.
multi_scale_stft_loss_alpha (float): Alpha value for the multi-scale STFT loss.
multi_scale_stft_loss_params (dict): Parameters for the multi-scale STFT loss.
return_wav (bool): Whether to return audio waveforms.
use_weighted_sampler (bool): Whether to use a weighted sampler.
weighted_sampler_attrs (dict): Attributes for the weighted sampler.
weighted_sampler_multipliers (dict): Multipliers for the weighted sampler.
r (int): Value for the `r` override.
compute_f0 (bool): Whether to compute F0 values.
f0_cache_path (str): Path to the F0 cache.
attn_prior_cache_path (str): Path to the attention prior cache.
num_speakers (int): Number of speakers.
use_speaker_embedding (bool): Whether to use speaker embedding.
speakers_file (str): Path to the speaker file.
speaker_embedding_channels (int): Number of channels for the speaker embedding.
language_ids_file (str): Path to the language IDs file.
"""
model: str = "delightful_tts"
# model specific params
audio: DelightfulTtsAudioConfig = field(default_factory=DelightfulTtsAudioConfig)
model_args: DelightfulTtsArgs = field(default_factory=DelightfulTtsArgs)
use_attn_priors: bool = True
# vocoder
vocoder: VocoderConfig = field(default_factory=VocoderConfig)
init_discriminator: bool = True
# optimizer
steps_to_start_discriminator: int = 200000
grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
lr_gen: float = 0.0002
lr_disc: float = 0.0002
lr_scheduler_gen: str = "ExponentialLR"
lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
lr_scheduler_disc: str = "ExponentialLR"
lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
scheduler_after_epoch: bool = True
optimizer: str = "AdamW"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})
# acoustic model loss params
ssim_loss_alpha: float = 1.0
mel_loss_alpha: float = 1.0
aligner_loss_alpha: float = 1.0
pitch_loss_alpha: float = 1.0
energy_loss_alpha: float = 1.0
u_prosody_loss_alpha: float = 0.5
p_prosody_loss_alpha: float = 0.5
dur_loss_alpha: float = 1.0
char_dur_loss_alpha: float = 0.01
binary_align_loss_alpha: float = 0.1
binary_loss_warmup_epochs: int = 10
# vocoder loss params
disc_loss_alpha: float = 1.0
gen_loss_alpha: float = 1.0
feat_loss_alpha: float = 1.0
vocoder_mel_loss_alpha: float = 10.0
multi_scale_stft_loss_alpha: float = 2.5
multi_scale_stft_loss_params: dict = field(
default_factory=lambda: {
"n_ffts": [1024, 2048, 512],
"hop_lengths": [120, 240, 50],
"win_lengths": [600, 1200, 240],
}
)
# data loader params
return_wav: bool = True
use_weighted_sampler: bool = False
weighted_sampler_attrs: dict = field(default_factory=lambda: {})
weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
# overrides
r: int = 1
# dataset configs
compute_f0: bool = True
f0_cache_path: str = None
attn_prior_cache_path: str = None
# multi-speaker settings
# use speaker embedding layer
num_speakers: int = 0
use_speaker_embedding: bool = False
speakers_file: str = None
speaker_embedding_channels: int = 256
language_ids_file: str = None
use_language_embedding: bool = False
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: str = None
d_vector_dim: int = None
# testing
test_sentences: List[List[str]] = field(
default_factory=lambda: [
["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
["Be a voice, not an echo."],
["I'm sorry Dave. I'm afraid I can't do that."],
["This cake is great. It's so delicious and moist."],
["Prior to November 22, 1963."],
]
)
def __post_init__(self):
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
if self.num_speakers > 0:
self.model_args.num_speakers = self.num_speakers
# speaker embedding settings
if self.use_speaker_embedding:
self.model_args.use_speaker_embedding = True
if self.speakers_file:
self.model_args.speakers_file = self.speakers_file
# d-vector settings
if self.use_d_vector_file:
self.model_args.use_d_vector_file = True
if self.d_vector_dim is not None and self.d_vector_dim > 0:
self.model_args.d_vector_dim = self.d_vector_dim
if self.d_vector_file:
self.model_args.d_vector_file = self.d_vector_file
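For reference, a hedged sketch of how the `__post_init__` above propagates the top-level multi-speaker settings into `model_args`; the import path is assumed to follow the `TTS.tts.configs` layout used elsewhere in this diff, and the speakers file name is a placeholder.

from TTS.tts.configs.delightful_tts_config import DelightfulTTSConfig  # assumed path

config = DelightfulTTSConfig(
    num_speakers=4,
    use_speaker_embedding=True,
    speakers_file="speakers.json",  # placeholder path
)
# __post_init__ mirrors the settings into the model args
assert config.model_args.num_speakers == 4
assert config.model_args.use_speaker_embedding is True
assert config.model_args.speakers_file == "speakers.json"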

View File

@ -100,13 +100,20 @@ class FastPitchConfig(BaseTTSConfig):
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
# dataset configs
compute_f0 (bool):
Compute pitch. Defaults to True.
f0_cache_path (str):
Path to the pitch cache. Defaults to None.
"""
model: str = "fast_pitch"
base_model: str = "forward_tts"
# model specific params
model_args: ForwardTTSArgs = ForwardTTSArgs()
model_args: ForwardTTSArgs = field(default_factory=ForwardTTSArgs)
# multi-speaker settings
num_speakers: int = 0

View File

@ -107,7 +107,7 @@ class FastSpeechConfig(BaseTTSConfig):
base_model: str = "forward_tts"
# model specific params
model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)
model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=False))
# multi-speaker settings
num_speakers: int = 0
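The `field(default_factory=...)` changes in this and the previous hunk follow the standard dataclass rule that mutable defaults must not be shared class-level objects; a small self-contained illustration of the pattern, with made-up class names:

from dataclasses import dataclass, field


@dataclass
class EncoderArgs:  # stand-in for ForwardTTSArgs
    dilations: list = field(default_factory=lambda: [1, 2, 4])


@dataclass
class ModelConfig:  # stand-in for FastSpeechConfig
    # each config instance gets a fresh EncoderArgs instead of one shared default object
    model_args: EncoderArgs = field(default_factory=EncoderArgs)


a, b = ModelConfig(), ModelConfig()
a.model_args.dilations.append(8)
assert b.model_args.dilations == [1, 2, 4]  # b is unaffected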

View File

@ -0,0 +1,198 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.forward_tts import ForwardTTSArgs
@dataclass
class Fastspeech2Config(BaseTTSConfig):
"""Configure `ForwardTTS` as a FastSpeech2 model.
Example:
>>> from TTS.tts.configs.fastspeech2_config import Fastspeech2Config
>>> config = Fastspeech2Config()
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `fastspeech2`.
base_model (str):
Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
model_args (Coqpit):
Model class arguments. Check `ForwardTTSArgs` for more details. Defaults to `ForwardTTSArgs()`.
data_dep_init_steps (int):
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
for the rest. Defaults to 10.
speakers_file (str):
Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
speaker names. Defaults to `None`.
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_d_vector_file (bool):
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
d_vector_dim (int):
Dimension of the external speaker embeddings. Defaults to 0.
optimizer (str):
Name of the model optimizer. Defaults to `Adam`.
optimizer_params (dict):
Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
lr_scheduler (str):
Name of the learning rate scheduler. Defaults to `NoamLR`.
lr_scheduler_params (dict):
Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
lr (float):
Initial learning rate. Defaults to `1e-4`.
grad_clip (float):
Gradient norm clipping value. Defaults to `5.0`.
spec_loss_type (str):
Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
duration_loss_type (str):
Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
use_ssim_loss (bool):
Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
wd (float):
Weight decay coefficient. Defaults to `1e-7`.
ssim_loss_alpha (float):
Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
dur_loss_alpha (float):
Weight for the duration predictor's loss. If set 0, disables the duration loss. Defaults to 0.1.
spec_loss_alpha (float):
Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
pitch_loss_alpha (float):
Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 0.1.
energy_loss_alpha (float):
Weight for the energy predictor's loss. If set 0, disables the energy predictor. Defaults to 0.1.
binary_align_loss_alpha (float):
Weight for the binary loss. If set 0, disables the binary loss. Defaults to 0.1.
binary_loss_warmup_epochs (float):
Number of epochs to gradually increase the binary loss impact. Defaults to 150.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
# dataset configs
compute_f0 (bool):
Compute pitch. Defaults to True.
f0_cache_path (str):
Path to the pitch cache. Defaults to None.
compute_energy (bool):
Compute energy. Defaults to True.
energy_cache_path (str):
Path to the energy cache. Defaults to None.
"""
model: str = "fastspeech2"
base_model: str = "forward_tts"
# model specific params
model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=True, use_energy=True))
# multi-speaker settings
num_speakers: int = 0
speakers_file: str = None
use_speaker_embedding: bool = False
use_d_vector_file: bool = False
d_vector_file: str = None
d_vector_dim: int = 0
# optimizer parameters
optimizer: str = "Adam"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
lr_scheduler: str = "NoamLR"
lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
lr: float = 1e-4
grad_clip: float = 5.0
# loss params
spec_loss_type: str = "mse"
duration_loss_type: str = "mse"
use_ssim_loss: bool = True
ssim_loss_alpha: float = 1.0
spec_loss_alpha: float = 1.0
aligner_loss_alpha: float = 1.0
pitch_loss_alpha: float = 0.1
energy_loss_alpha: float = 0.1
dur_loss_alpha: float = 0.1
binary_align_loss_alpha: float = 0.1
binary_loss_warmup_epochs: int = 150
# overrides
min_seq_len: int = 13
max_seq_len: int = 200
r: int = 1 # DO NOT CHANGE
# dataset configs
compute_f0: bool = True
f0_cache_path: str = None
# dataset configs
compute_energy: bool = True
energy_cache_path: str = None
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
"I'm sorry Dave. I'm afraid I can't do that.",
"This cake is great. It's so delicious and moist.",
"Prior to November 22, 1963.",
]
)
def __post_init__(self):
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
if self.num_speakers > 0:
self.model_args.num_speakers = self.num_speakers
# speaker embedding settings
if self.use_speaker_embedding:
self.model_args.use_speaker_embedding = True
if self.speakers_file:
self.model_args.speakers_file = self.speakers_file
# d-vector settings
if self.use_d_vector_file:
self.model_args.use_d_vector_file = True
if self.d_vector_dim is not None and self.d_vector_dim > 0:
self.model_args.d_vector_dim = self.d_vector_dim
if self.d_vector_file:
self.model_args.d_vector_file = self.d_vector_file
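A hedged usage sketch for the config above; the import mirrors the docstring example (note the class is spelled `Fastspeech2Config` here), and the cache paths are placeholders.

from TTS.tts.configs.fastspeech2_config import Fastspeech2Config  # as defined above

config = Fastspeech2Config(
    compute_f0=True,
    f0_cache_path="cache/f0",          # placeholder
    compute_energy=True,
    energy_cache_path="cache/energy",  # placeholder
)
# the base model is ForwardTTS with both pitch and energy predictors enabled
assert config.base_model == "forward_tts"
assert config.model_args.use_pitch and config.model_args.use_energy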

View File

@ -0,0 +1,170 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
@dataclass
class NeuralhmmTTSConfig(BaseTTSConfig):
"""
Define parameters for Neural HMM TTS model.
Example:
>>> from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig
>>> config = NeuralhmmTTSConfig()
Args:
model (str):
Model name used to select the right model class to initialize. Defaults to `NeuralHMM_TTS`.
run_eval_steps (int):
Run an evaluation epoch every N steps. If None, waits until the training epoch is completed. Defaults to 100.
save_step (int):
Save local checkpoint every save_step steps. Defaults to 500.
plot_step (int):
Plot training stats on the logger every plot_step steps. Defaults to 1.
model_param_stats (bool):
Log model parameters stats on the logger dashboard. Defaults to False.
force_generate_statistics (bool):
Force generate mel normalization statistics. Defaults to False.
mel_statistics_parameter_path (str):
Path to the mel normalization statistics. If the model doesn't find a file there, it will generate statistics.
Defaults to None.
num_chars (int):
Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
state_per_phone (int):
Generates N states per phone. Similar to the `add_blank` parameter in GlowTTS, but in Overflow it is upsampled by the model's encoder. Defaults to 2.
encoder_in_out_features (int):
Channels of encoder input and character embedding tensors. Defaults to 512.
encoder_n_convolutions (int):
Number of convolution layers in the encoder. Defaults to 3.
out_channels (int):
Channels of the final model output. It must match the spectrogram size. Defaults to 80.
ar_order (int):
Autoregressive order of the model. Defaults to 1. In the Neural HMM ablations, more autoregression gives more variation but hurts the naturalness of the synthesised audio.
sampling_temp (float):
Variation added to the sample from the latent space of the neural HMM. Defaults to 0.0.
deterministic_transition (bool):
Deterministic duration generation based on duration quantiles, as defined in "S. Ronanki, O. Watts, S. King, and G. E. Henter, “Median-based generation of synthetic speech durations using a non-parametric approach,” in Proc. SLT, 2016.". Defaults to True.
duration_threshold (float):
Threshold for the duration quantiles. Defaults to 0.43. Tune this to change the speaking rate of the synthesis, where lower values give a slower speaking rate and higher values give a faster one.
use_grad_checkpointing (bool):
Use gradient checkpointing to save memory. PyTorch currently does not support gradient checkpointing inside a loop in a multi-GPU setting, so it has to be turned off there. Adjust it depending on whether you get a larger batch size with a single GPU or with multiple GPUs. Defaults to True.
max_sampling_time (int):
Maximum sampling time while synthesising latents from neural HMM. Defaults to 1000.
prenet_type (str):
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
Prenet. Defaults to `original`.
prenet_dim (int):
Dimension of the Prenet. Defaults to 256.
prenet_n_layers (int):
Number of layers in the Prenet. Defaults to 2.
prenet_dropout (float):
Dropout rate of the Prenet. Defaults to 0.5.
prenet_dropout_at_inference (bool):
Use dropout at inference time. Defaults to True.
memory_rnn_dim (int):
Dimension of the memory LSTM to process the prenet output. Defaults to 1024.
outputnet_size (list[int]):
Size of the output network inside the neural HMM. Defaults to [1024].
flat_start_params (dict):
Parameters for the flat start initialization of the neural HMM. Defaults to `{"mean": 0.0, "std": 1.0, "transition_p": 0.14}`.
It will be recomputed when you pass the dataset.
std_floor (float):
Floor value for the standard deviation of the neural HMM. Prevents the model from cheating by putting point mass and getting infinite likelihood at any datapoint. Defaults to 0.001.
It is called `variance flooring` in standard HMM literature.
optimizer (str):
Optimizer to use for training. Defaults to `adam`.
optimizer_params (dict):
Parameters for the optimizer. Defaults to `{"weight_decay": 1e-6}`.
grad_clip (float):
Gradient clipping threshold. Defaults to 40_000.
lr (float):
Learning rate. Defaults to 1e-3.
lr_scheduler (str):
Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
`TTS.utils.training`. Defaults to `None`.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
"""
model: str = "NeuralHMM_TTS"
# Training and Checkpoint configs
run_eval_steps: int = 100
save_step: int = 500
plot_step: int = 1
model_param_stats: bool = False
# data parameters
force_generate_statistics: bool = False
mel_statistics_parameter_path: str = None
# Encoder parameters
num_chars: int = None
state_per_phone: int = 2
encoder_in_out_features: int = 512
encoder_n_convolutions: int = 3
# HMM parameters
out_channels: int = 80
ar_order: int = 1
sampling_temp: float = 0
deterministic_transition: bool = True
duration_threshold: float = 0.43
use_grad_checkpointing: bool = True
max_sampling_time: int = 1000
## Prenet parameters
prenet_type: str = "original"
prenet_dim: int = 256
prenet_n_layers: int = 2
prenet_dropout: float = 0.5
prenet_dropout_at_inference: bool = True
memory_rnn_dim: int = 1024
## Outputnet parameters
outputnet_size: List[int] = field(default_factory=lambda: [1024])
flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14})
std_floor: float = 0.001
# optimizer parameters
optimizer: str = "Adam"
optimizer_params: dict = field(default_factory=lambda: {"weight_decay": 1e-6})
grad_clip: float = 40000.0
lr: float = 1e-3
lr_scheduler: str = None
# overrides
min_text_len: int = 10
max_text_len: int = 500
min_audio_len: int = 512
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"Be a voice, not an echo.",
]
)
# Extra needed config
r: int = 1
use_d_vector_file: bool = False
use_speaker_embedding: bool = False
def check_values(self):
"""Validate the hyperparameters.
Raises:
AssertionError: when the parameters network is not defined
AssertionError: transition probability is not between 0 and 1
"""
assert self.ar_order > 0, "AR order must be greater than 0; this is an autoregressive model."
assert (
len(self.outputnet_size) >= 1
), f"Parameter network must have at least one layer. Check the config file. Provided: {self.outputnet_size}"
assert (
0 < self.flat_start_params["transition_p"] < 1
), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"
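`check_values` is a plain assertion helper; a hedged sketch of calling it (import path assumed to be `TTS.tts.configs.neuralhmm_tts_config`, matching the corrected docstring example above):

from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig  # assumed path

config = NeuralhmmTTSConfig(num_chars=120)
config.check_values()  # passes with the defaults above

bad = NeuralhmmTTSConfig(num_chars=120, flat_start_params={"mean": 0.0, "std": 1.0, "transition_p": 1.5})
try:
    bad.check_values()
except AssertionError as err:
    print(err)  # Transition probability must be between 0 and 1. Provided: 1.5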

View File

@ -0,0 +1,201 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
@dataclass
class OverflowConfig(BaseTTSConfig): # The classname has to be camel case
"""
Define parameters for OverFlow model.
Example:
>>> from TTS.tts.configs.overflow_config import OverflowConfig
>>> config = OverflowConfig()
Args:
model (str):
Model name used to select the right model class to initialize. Defaults to `Overflow`.
run_eval_steps (int):
Run an evaluation epoch every N steps. If None, waits until the training epoch is completed. Defaults to 100.
save_step (int):
Save local checkpoint every save_step steps. Defaults to 500.
plot_step (int):
Plot training stats on the logger every plot_step steps. Defaults to 1.
model_param_stats (bool):
Log model parameters stats on the logger dashboard. Defaults to False.
force_generate_statistics (bool):
Force generate mel normalization statistics. Defaults to False.
mel_statistics_parameter_path (str):
Path to the mel normalization statistics. If the model doesn't find a file there, it will generate statistics.
Defaults to None.
num_chars (int):
Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
state_per_phone (int):
Generates N states per phone. Similar to the `add_blank` parameter in GlowTTS, but in Overflow it is upsampled by the model's encoder. Defaults to 2.
encoder_in_out_features (int):
Channels of encoder input and character embedding tensors. Defaults to 512.
encoder_n_convolutions (int):
Number of convolution layers in the encoder. Defaults to 3.
out_channels (int):
Channels of the final model output. It must match the spectrogram size. Defaults to 80.
ar_order (int):
Autoregressive order of the model. Defaults to 1. In the Neural HMM ablations, more autoregression gives more variation but hurts the naturalness of the synthesised audio.
sampling_temp (float):
Variation added to the sample from the latent space of neural HMM. Defaults to 0.334.
deterministic_transition (bool):
Deterministic duration generation based on duration quantiles, as defined in "S. Ronanki, O. Watts, S. King, and G. E. Henter, “Median-based generation of synthetic speech durations using a non-parametric approach,” in Proc. SLT, 2016.". Defaults to True.
duration_threshold (float):
Threshold for the duration quantiles. Defaults to 0.55. Tune this to change the speaking rate of the synthesis, where lower values give a slower speaking rate and higher values give a faster one.
use_grad_checkpointing (bool):
Use gradient checkpointing to save memory. PyTorch currently does not support gradient checkpointing inside a loop in a multi-GPU setting, so it has to be turned off there. Adjust it depending on whether you get a larger batch size with a single GPU or with multiple GPUs. Defaults to True.
max_sampling_time (int):
Maximum sampling time while synthesising latents from neural HMM. Defaults to 1000.
prenet_type (str):
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
Prenet. Defaults to `original`.
prenet_dim (int):
Dimension of the Prenet. Defaults to 256.
prenet_n_layers (int):
Number of layers in the Prenet. Defaults to 2.
prenet_dropout (float):
Dropout rate of the Prenet. Defaults to 0.5.
prenet_dropout_at_inference (bool):
Use dropout at inference time. Defaults to False.
memory_rnn_dim (int):
Dimension of the memory LSTM to process the prenet output. Defaults to 1024.
outputnet_size (list[int]):
Size of the output network inside the neural HMM. Defaults to [1024].
flat_start_params (dict):
Parameters for the flat start initialization of the neural HMM. Defaults to `{"mean": 0.0, "std": 1.0, "transition_p": 0.14}`.
It will be recomputed when you pass the dataset.
std_floor (float):
Floor value for the standard deviation of the neural HMM. Prevents model cheating by putting point mass and getting infinite likelihood at any datapoint. Defaults to 0.01.
It is called `variance flooring` in standard HMM literature.
hidden_channels_dec (int):
Number of base hidden channels used by the decoder WaveNet network. Defaults to 150.
kernel_size_dec (int):
Decoder kernel size. Defaults to 5
dilation_rate (int):
Rate to increase dilation by each layer in a decoder block. Defaults to 1.
num_flow_blocks_dec (int):
Number of decoder blocks. Defaults to 12.
num_block_layers (int):
Number of decoder layers in each decoder block. Defaults to 4.
dropout_p_dec (float):
Dropout rate of the decoder. Defaults to 0.05.
num_splits (int):
Number of split levels in the invertible conv1x1 operation. Defaults to 4.
num_squeeze (int):
Number of squeeze levels. When squeezing, the number of channels increases and the number of time steps is reduced by a factor of
`num_squeeze`. Defaults to 2.
sigmoid_scale (bool):
enable/disable sigmoid scaling in decoder. Defaults to False.
c_in_channels (int):
Unused parameter from GlowTTS's decoder. Defaults to 0.
optimizer (str):
Optimizer to use for training. Defaults to `adam`.
optimizer_params (dict):
Parameters for the optimizer. Defaults to `{"weight_decay": 1e-6}`.
grad_clip (float):
Gradient clipping threshold. Defaults to 40_000.
lr (float):
Learning rate. Defaults to 1e-3.
lr_scheduler (str):
Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
`TTS.utils.training`. Defaults to `None`.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
"""
model: str = "Overflow"
# Training and Checkpoint configs
run_eval_steps: int = 100
save_step: int = 500
plot_step: int = 1
model_param_stats: bool = False
# data parameters
force_generate_statistics: bool = False
mel_statistics_parameter_path: str = None
# Encoder parameters
num_chars: int = None
state_per_phone: int = 2
encoder_in_out_features: int = 512
encoder_n_convolutions: int = 3
# HMM parameters
out_channels: int = 80
ar_order: int = 1
sampling_temp: float = 0.334
deterministic_transition: bool = True
duration_threshold: float = 0.55
use_grad_checkpointing: bool = True
max_sampling_time: int = 1000
## Prenet parameters
prenet_type: str = "original"
prenet_dim: int = 256
prenet_n_layers: int = 2
prenet_dropout: float = 0.5
prenet_dropout_at_inference: bool = False
memory_rnn_dim: int = 1024
## Outputnet parameters
outputnet_size: List[int] = field(default_factory=lambda: [1024])
flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14})
std_floor: float = 0.01
# Decoder parameters
hidden_channels_dec: int = 150
kernel_size_dec: int = 5
dilation_rate: int = 1
num_flow_blocks_dec: int = 12
num_block_layers: int = 4
dropout_p_dec: float = 0.05
num_splits: int = 4
num_squeeze: int = 2
sigmoid_scale: bool = False
c_in_channels: int = 0
# optimizer parameters
optimizer: str = "Adam"
optimizer_params: dict = field(default_factory=lambda: {"weight_decay": 1e-6})
grad_clip: float = 40000.0
lr: float = 1e-3
lr_scheduler: str = None
# overrides
min_text_len: int = 10
max_text_len: int = 500
min_audio_len: int = 512
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"Be a voice, not an echo.",
]
)
# Extra needed config
r: int = 1
use_d_vector_file: bool = False
use_speaker_embedding: bool = False
def check_values(self):
"""Validate the hyperparameters.
Raises:
AssertionError: when the parameters network is not defined
AssertionError: transition probability is not between 0 and 1
"""
assert self.ar_order > 0, "AR order must be greater than 0; this is an autoregressive model."
assert (
len(self.outputnet_size) >= 1
), f"Parameter network must have at least one layer. Check the config file. Provided: {self.outputnet_size}"
assert (
0 < self.flat_start_params["transition_p"] < 1
), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"
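Per the docstring above, lower `duration_threshold` values slow the speech down and higher values speed it up, while `sampling_temp` controls the variation of the sampled latents; a hedged sketch of overriding them (import path taken from the docstring example):

from TTS.tts.configs.overflow_config import OverflowConfig

slow = OverflowConfig(duration_threshold=0.40, sampling_temp=0.2)  # slower, less varied speech
fast = OverflowConfig(duration_threshold=0.70)                     # faster speech, default variation
print(slow.duration_threshold, slow.sampling_temp)
print(fast.duration_threshold, fast.sampling_temp)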

View File

@ -217,6 +217,9 @@ class BaseTTSConfig(BaseTrainingConfig):
compute_f0 (bool):
(Not in use yet).
compute_energy (bool):
(Not in use yet).
compute_linear_spec (bool):
If True data loader computes and returns linear spectrograms alongside the other data.
@ -230,6 +233,13 @@ class BaseTTSConfig(BaseTrainingConfig):
If True, the data loader will start loading the longest batch first. It is useful for checking OOM issues.
Defaults to False.
shuffle (bool):
If True, the data loader will shuffle the dataset when there is no sampler defined. Defaults to False.
drop_last (bool):
If True, the data loader will drop the last batch if it is not complete. It helps to prevent
issues caused by partial-batch statistics. Defaults to False.
add_blank (bool):
Add a blank character between every two characters. It improves performance for some models at the expense
of slower run-time due to the longer input sequence.
@ -305,17 +315,20 @@ class BaseTTSConfig(BaseTrainingConfig):
min_text_len: int = 1
max_text_len: int = float("inf")
compute_f0: bool = False
compute_energy: bool = False
compute_linear_spec: bool = False
precompute_num_workers: int = 0
use_noise_augment: bool = False
start_by_longest: bool = False
shuffle: bool = False
drop_last: bool = False
# dataset
datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
# optimizer
optimizer: str = "radam"
optimizer_params: dict = None
# scheduler
lr_scheduler: str = ""
lr_scheduler: str = None
lr_scheduler_params: dict = field(default_factory=lambda: {})
# testing
test_sentences: List[str] = field(default_factory=lambda: [])
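The new `compute_energy`, `shuffle`, and `drop_last` fields are plain data-loader switches on `BaseTTSConfig`; a hedged sketch of setting them on a concrete subclass (`VitsConfig` is used only as an example of a `BaseTTSConfig` subclass):

from TTS.tts.configs.vits_config import VitsConfig  # any BaseTTSConfig subclass works here

config = VitsConfig(
    shuffle=True,          # shuffle samples when no custom sampler is defined
    drop_last=True,        # drop the final partial batch to keep batch statistics stable
    compute_energy=False,  # energy extraction stays off unless the model needs it
)
print(config.shuffle, config.drop_last, config.compute_energy)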

View File

@ -103,26 +103,28 @@ class SpeedySpeechConfig(BaseTTSConfig):
base_model: str = "forward_tts"
# set model args as SpeedySpeech
model_args: ForwardTTSArgs = ForwardTTSArgs(
use_pitch=False,
encoder_type="residual_conv_bn",
encoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 13,
},
decoder_type="residual_conv_bn",
decoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4, 8] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 17,
},
out_channels=80,
hidden_channels=128,
positional_encoding=True,
detach_duration_predictor=True,
model_args: ForwardTTSArgs = field(
default_factory=lambda: ForwardTTSArgs(
use_pitch=False,
encoder_type="residual_conv_bn",
encoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 13,
},
decoder_type="residual_conv_bn",
decoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4, 8] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 17,
},
out_channels=80,
hidden_channels=128,
positional_encoding=True,
detach_duration_predictor=True,
)
)
# multi-speaker settings

View File

@ -0,0 +1,87 @@
from dataclasses import dataclass, field
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.tortoise import TortoiseArgs, TortoiseAudioConfig
@dataclass
class TortoiseConfig(BaseTTSConfig):
"""Defines parameters for Tortoise TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (TortoiseArgs):
Model architecture arguments. Defaults to `TortoiseArgs()`.
audio (TortoiseAudioConfig):
Audio processing configuration. Defaults to `TortoiseAudioConfig()`.
model_dir (str):
Path to the folder that has all the Tortoise models. Defaults to None.
temperature (float):
Temperature for the autoregressive model inference. Larger values make predictions more creative at the expense of stability. Defaults to `0.2`.
length_penalty (float):
Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length,
which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative),
length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.
repetition_penalty (float):
The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.
top_p (float):
If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
Defaults to `0.8`.
cond_free_k (float):
Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
Formula is: output=cond_present_output*(cond_free_k+1)-cond_absent_output*cond_free_k. Defaults to `2.0`.
diffusion_temperature (float):
Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
are the "mean" prediction of the diffusion network and will sound bland and smeared.
Defaults to `1.0`.
num_autoregressive_samples (int):
Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
As Tortoise is a probabilistic model, more samples means a higher probability of creating something "great".
Defaults to `16`.
diffusion_iterations (int):
Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
however. Defaults to `30`.
sampler (str):
Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.tts.configs.tortoise_config import TortoiseConfig
>>> config = TortoiseConfig()
"""
model: str = "tortoise"
# model specific params
model_args: TortoiseArgs = field(default_factory=TortoiseArgs)
audio: TortoiseAudioConfig = field(default_factory=TortoiseAudioConfig)
model_dir: str = None
# settings
temperature: float = 0.2
length_penalty: float = 1.0
repetition_penalty: float = 2.0
top_p: float = 0.8
cond_free_k: float = 2.0
diffusion_temperature: float = 1.0
# inference params
num_autoregressive_samples: int = 16
diffusion_iterations: int = 30
sampler: str = "ddim"
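A hedged sketch of adjusting the inference knobs documented above (import path from the docstring example; the values are illustrative, not recommended settings):

from TTS.tts.configs.tortoise_config import TortoiseConfig

config = TortoiseConfig(
    temperature=0.4,                # more creative, less stable autoregressive sampling
    num_autoregressive_samples=32,  # more candidates for CLVP to filter
    diffusion_iterations=100,       # more refinement steps in the diffusion decoder
    sampler="dpm++2m",              # the alternative sampler named in the docstring
)
print(config.temperature, config.sampler)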

View File

@ -109,7 +109,7 @@ class VitsConfig(BaseTTSConfig):
model: str = "vits"
# model specific params
model_args: VitsArgs = field(default_factory=VitsArgs)
audio: VitsAudioConfig = VitsAudioConfig()
audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)
# optimizer
grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
@ -167,7 +167,7 @@ class VitsConfig(BaseTTSConfig):
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: str = None
d_vector_file: List[str] = None
d_vector_dim: int = None
def __post_init__(self):
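The `d_vector_file` hunk above widens the field from a single path to a list of paths; a hedged sketch of passing several d-vector files through the model args (`VitsArgs` import path assumed, file names are placeholders):

from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models.vits import VitsArgs  # assumed import path

config = VitsConfig(
    model_args=VitsArgs(
        use_d_vector_file=True,
        d_vector_file=["speakers_set1.pth", "speakers_set2.pth"],  # now a list of files
        d_vector_dim=512,
    )
)
print(config.model_args.d_vector_file)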

View File

@ -0,0 +1,107 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig
@dataclass
class XttsConfig(BaseTTSConfig):
"""Defines parameters for XTTS TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (XttsArgs):
Model architecture arguments. Defaults to `XttsArgs()`.
audio (XttsAudioConfig):
Audio processing configuration. Defaults to `XttsAudioConfig()`.
model_dir (str):
Path to the folder that has all the XTTS models. Defaults to None.
temperature (float):
Temperature for the autoregressive model inference. Larger values make predictions more creative at the expense of stability. Defaults to `0.85`.
length_penalty (float):
Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length,
which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative),
length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.
repetition_penalty (float):
The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.
top_p (float):
If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
Defaults to `0.85`.
num_gpt_outputs (int):
Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
As XTTS is a probabilistic model, more samples means a higher probability of creating something "great".
Defaults to `1`.
gpt_cond_len (int):
Seconds of audio to be used as conditioning for the autoregressive model. Defaults to `12`.
gpt_cond_chunk_len (int):
Audio chunk size in secs. Audio is split into chunks and latents are extracted for each chunk. Then the
latents are averaged. Chunking improves the stability. It must be <= gpt_cond_len.
If gpt_cond_len == gpt_cond_chunk_len, no chunking. Defaults to `4`.
max_ref_len (int):
Maximum number of seconds of audio to be used as conditioning for the decoder. Defaults to `10`.
sound_norm_refs (bool):
Whether to normalize the conditioning audio. Defaults to `False`.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.tts.configs.xtts_config import XttsConfig
>>> config = XttsConfig()
"""
model: str = "xtts"
# model specific params
model_args: XttsArgs = field(default_factory=XttsArgs)
audio: XttsAudioConfig = field(default_factory=XttsAudioConfig)
model_dir: str = None
languages: List[str] = field(
default_factory=lambda: [
"en",
"es",
"fr",
"de",
"it",
"pt",
"pl",
"tr",
"ru",
"nl",
"cs",
"ar",
"zh-cn",
"hu",
"ko",
"ja",
"hi",
]
)
# inference params
temperature: float = 0.85
length_penalty: float = 1.0
repetition_penalty: float = 2.0
top_k: int = 50
top_p: float = 0.85
num_gpt_outputs: int = 1
# cloning
gpt_cond_len: int = 12
gpt_cond_chunk_len: int = 4
max_ref_len: int = 10
sound_norm_refs: bool = False
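A hedged sketch of the voice-cloning parameters described above; note the constraint that `gpt_cond_chunk_len` must not exceed `gpt_cond_len` (import path from the docstring example):

from TTS.tts.configs.xtts_config import XttsConfig

config = XttsConfig(
    gpt_cond_len=12,       # seconds of reference audio used for GPT conditioning
    gpt_cond_chunk_len=4,  # latents computed per 4 s chunk and averaged; must be <= gpt_cond_len
    max_ref_len=10,        # cap on reference audio seen by the decoder
    sound_norm_refs=False,
)
assert config.gpt_cond_chunk_len <= config.gpt_cond_len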

View File

@ -1,3 +1,4 @@
import os
import sys
from collections import Counter
from pathlib import Path
@ -12,20 +13,16 @@ from TTS.tts.datasets.formatters import *
def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
"""Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.
Args:
items (List[List]):
A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`.
eval_split_max_size (int):
Maximum number of samples to be used for evaluation in proportion split. Defaults to None (disabled).
eval_split_size (float):
If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
"""
speakers = [item["speaker_name"] for item in items]
is_multi_speaker = len(set(speakers)) > 1
@ -59,6 +56,17 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
return items[:eval_split_size], items[eval_split_size:]
def add_extra_keys(metadata, language, dataset_name):
for item in metadata:
# add language name
item["language"] = language
# add unique audio name
relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0]
audio_unique_name = f"{dataset_name}#{relfilepath}"
item["audio_unique_name"] = audio_unique_name
return metadata
def load_tts_samples(
datasets: Union[List[Dict], Dict],
eval_split=True,
@ -97,7 +105,8 @@ def load_tts_samples(
if not isinstance(datasets, list):
datasets = [datasets]
for dataset in datasets:
name = dataset["name"]
formatter_name = dataset["formatter"]
dataset_name = dataset["dataset_name"]
root_path = dataset["path"]
meta_file_train = dataset["meta_file_train"]
meta_file_val = dataset["meta_file_val"]
@ -106,19 +115,22 @@ def load_tts_samples(
# setup the right data processor
if formatter is None:
formatter = _get_formatter_by_name(name)
formatter = _get_formatter_by_name(formatter_name)
# load train set
meta_data_train = formatter(root_path, meta_file_train, ignored_speakers=ignored_speakers)
meta_data_train = [{**item, **{"language": language}} for item in meta_data_train]
assert len(meta_data_train) > 0, f" [!] No training samples found in {root_path}/{meta_file_train}"
meta_data_train = add_extra_keys(meta_data_train, language, dataset_name)
print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
# load evaluation split if set
if eval_split:
if meta_file_val:
meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
meta_data_eval = [{**item, **{"language": language}} for item in meta_data_eval]
meta_data_eval = add_extra_keys(meta_data_eval, language, dataset_name)
else:
meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_split_max_size, eval_split_size)
eval_size_per_dataset = eval_split_max_size // len(datasets) if eval_split_max_size else None
meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_size_per_dataset, eval_split_size)
meta_data_eval_all += meta_data_eval
meta_data_train_all += meta_data_train
# load attention masks for the duration predictor training
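For clarity, `add_extra_keys` derives the `audio_unique_name` key that later hunks use for cache file names and d-vector lookups; a small self-contained illustration of the naming scheme (paths are made up):

import os

item = {"audio_file": "/data/ljspeech/wavs/LJ001-0001.wav", "root_path": "/data/ljspeech"}
relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0]
audio_unique_name = f"ljspeech#{relfilepath}"
print(audio_unique_name)  # ljspeech#wavs/LJ001-0001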

View File

@ -1,3 +1,4 @@
import base64
import collections
import os
import random
@ -10,6 +11,9 @@ from torch.utils.data import Dataset
from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
from TTS.utils.audio import AudioProcessor
from TTS.utils.audio.numpy_transforms import compute_energy as calculate_energy
import mutagen
# to prevent too many open files error as suggested here
# https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936
@ -34,6 +38,21 @@ def noise_augment_audio(wav):
return wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)
def string2filename(string):
# generate a safe and reversible filename based on a string
filename = base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore")
return filename
def get_audio_size(audiopath):
extension = audiopath.rpartition(".")[-1].lower()
if extension not in {"mp3", "wav", "flac"}:
raise RuntimeError(f"The audio format {extension} is not supported, please convert the audio files to mp3, flac, or wav format!")
audio_info = mutagen.File(audiopath).info
return int(audio_info.length * audio_info.sample_rate)
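The two helpers above replace the old size-based length estimate (`os.path.getsize(...) / 16 * 8`) with metadata read via mutagen and give cache entries reversible names; a hedged usage sketch, assuming both are importable from `TTS.tts.datasets.dataset` and the wav path is a placeholder:

import base64

from TTS.tts.datasets.dataset import get_audio_size, string2filename  # as defined above

n_samples = get_audio_size("clips/sample_0001.wav")  # mp3 and flac are accepted as well
print(n_samples)                                     # length in samples from the audio metadata

key = string2filename("ljspeech#wavs/LJ001-0001")
print(key)                                            # filesystem-safe cache name
print(base64.urlsafe_b64decode(key).decode("utf-8"))  # round-trips to the original key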
class TTSDataset(Dataset):
def __init__(
self,
@ -43,7 +62,9 @@ class TTSDataset(Dataset):
samples: List[Dict] = None,
tokenizer: "TTSTokenizer" = None,
compute_f0: bool = False,
compute_energy: bool = False,
f0_cache_path: str = None,
energy_cache_path: str = None,
return_wav: bool = False,
batch_group_size: int = 0,
min_text_len: int = 0,
@ -77,8 +98,12 @@ class TTSDataset(Dataset):
compute_f0 (bool): compute f0 if True. Defaults to False.
compute_energy (bool): compute energy if True. Defaults to False.
f0_cache_path (str): Path to store f0 cache. Defaults to None.
energy_cache_path (str): Path to store energy cache. Defaults to None.
return_wav (bool): Return the waveform of the sample. Defaults to False.
batch_group_size (int): Range of batch randomization after sorting
@ -121,7 +146,9 @@ class TTSDataset(Dataset):
self.compute_linear_spec = compute_linear_spec
self.return_wav = return_wav
self.compute_f0 = compute_f0
self.compute_energy = compute_energy
self.f0_cache_path = f0_cache_path
self.energy_cache_path = energy_cache_path
self.min_audio_len = min_audio_len
self.max_audio_len = max_audio_len
self.min_text_len = min_text_len
@ -148,7 +175,10 @@ class TTSDataset(Dataset):
self.f0_dataset = F0Dataset(
self.samples, self.ap, cache_path=f0_cache_path, precompute_num_workers=precompute_num_workers
)
if compute_energy:
self.energy_dataset = EnergyDataset(
self.samples, self.ap, cache_path=energy_cache_path, precompute_num_workers=precompute_num_workers
)
if self.verbose:
self.print_logs()
@ -157,7 +187,7 @@ class TTSDataset(Dataset):
lens = []
for item in self.samples:
_, wav_file, *_ = _parse_sample(item)
audio_len = os.path.getsize(wav_file) / 16 * 8 # assuming 16bit audio
audio_len = get_audio_size(wav_file)
lens.append(audio_len)
return lens
@ -170,6 +200,8 @@ class TTSDataset(Dataset):
self._samples = new_samples
if hasattr(self, "f0_dataset"):
self.f0_dataset.samples = new_samples
if hasattr(self, "energy_dataset"):
self.energy_dataset.samples = new_samples
if hasattr(self, "phoneme_dataset"):
self.phoneme_dataset.samples = new_samples
@ -201,7 +233,13 @@ class TTSDataset(Dataset):
def get_f0(self, idx):
out_dict = self.f0_dataset[idx]
item = self.samples[idx]
assert item["audio_file"] == out_dict["audio_file"]
assert item["audio_unique_name"] == out_dict["audio_unique_name"]
return out_dict
def get_energy(self, idx):
out_dict = self.energy_dataset[idx]
item = self.samples[idx]
assert item["audio_unique_name"] == out_dict["audio_unique_name"]
return out_dict
@staticmethod
@ -245,17 +283,22 @@ class TTSDataset(Dataset):
f0 = None
if self.compute_f0:
f0 = self.get_f0(idx)["f0"]
energy = None
if self.compute_energy:
energy = self.get_energy(idx)["energy"]
sample = {
"raw_text": raw_text,
"token_ids": token_ids,
"wav": wav,
"pitch": f0,
"energy": energy,
"attn": attn,
"item_idx": item["audio_file"],
"speaker_name": item["speaker_name"],
"language_name": item["language"],
"wav_file_name": os.path.basename(item["audio_file"]),
"audio_unique_name": item["audio_unique_name"],
}
return sample
@ -263,7 +306,7 @@ class TTSDataset(Dataset):
def _compute_lengths(samples):
new_samples = []
for item in samples:
audio_length = os.path.getsize(item["audio_file"]) / 16 * 8 # assuming 16bit audio
audio_length = get_audio_size(item["audio_file"])
text_lenght = len(item["text"])
item["audio_length"] = audio_length
item["text_length"] = text_lenght
@ -381,7 +424,6 @@ class TTSDataset(Dataset):
# Puts each data field into a tensor with outer dimension batch size
if isinstance(batch[0], collections.abc.Mapping):
token_ids_lengths = np.array([len(d["token_ids"]) for d in batch])
# sort items with text input length for RNN efficiency
@ -397,8 +439,8 @@ class TTSDataset(Dataset):
language_ids = None
# get pre-computed d-vectors
if self.d_vector_mapping is not None:
wav_files_names = list(batch["wav_file_name"])
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names]
embedding_keys = list(batch["audio_unique_name"])
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in embedding_keys]
else:
d_vectors = None
@ -482,7 +524,13 @@ class TTSDataset(Dataset):
pitch = torch.FloatTensor(pitch)[:, None, :].contiguous() # B x 1 xT
else:
pitch = None
# format energy
if self.compute_energy:
energy = prepare_data(batch["energy"])
assert mel.shape[1] == energy.shape[1], f"[!] {mel.shape} vs {energy.shape}"
energy = torch.FloatTensor(energy)[:, None, :].contiguous() # B x 1 xT
else:
energy = None
# format attention masks
attns = None
if batch["attn"][0] is not None:
@ -511,7 +559,9 @@ class TTSDataset(Dataset):
"waveform": wav_padded,
"raw_text": batch["raw_text"],
"pitch": pitch,
"energy": energy,
"language_ids": language_ids,
"audio_unique_names": batch["audio_unique_name"],
}
raise TypeError(
@ -560,25 +610,24 @@ class PhonemeDataset(Dataset):
def __getitem__(self, index):
item = self.samples[index]
ids = self.compute_or_load(item["audio_file"], item["text"])
ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"], item["language"])
ph_hat = self.tokenizer.ids_to_text(ids)
return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)}
def __len__(self):
return len(self.samples)
def compute_or_load(self, wav_file, text):
def compute_or_load(self, file_name, text, language):
"""Compute phonemes for the given text.
If the phonemes are already cached, load them from cache.
"""
file_name = os.path.splitext(os.path.basename(wav_file))[0]
file_ext = "_phoneme.npy"
cache_path = os.path.join(self.cache_path, file_name + file_ext)
try:
ids = np.load(cache_path)
except FileNotFoundError:
ids = self.tokenizer.text_to_ids(text)
ids = self.tokenizer.text_to_ids(text, language=language)
np.save(cache_path, ids)
return ids
@ -648,6 +697,7 @@ class F0Dataset:
self,
samples: Union[List[List], List[Dict]],
ap: "AudioProcessor",
audio_config=None, # pylint: disable=unused-argument
verbose=False,
cache_path: str = None,
precompute_num_workers=0,
@ -669,11 +719,11 @@ class F0Dataset:
def __getitem__(self, idx):
item = self.samples[idx]
f0 = self.compute_or_load(item["audio_file"])
f0 = self.compute_or_load(item["audio_file"], string2filename(item["audio_unique_name"]))
if self.normalize_f0:
assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available"
f0 = self.normalize(f0)
return {"audio_file": item["audio_file"], "f0": f0}
return {"audio_unique_name": item["audio_unique_name"], "f0": f0}
def __len__(self):
return len(self.samples)
@ -705,8 +755,7 @@ class F0Dataset:
return self.pad_id
@staticmethod
def create_pitch_file_path(wav_file, cache_path):
file_name = os.path.splitext(os.path.basename(wav_file))[0]
def create_pitch_file_path(file_name, cache_path):
pitch_file = os.path.join(cache_path, file_name + "_pitch.npy")
return pitch_file
@ -744,11 +793,11 @@ class F0Dataset:
pitch[zero_idxs] = 0.0
return pitch
def compute_or_load(self, wav_file):
def compute_or_load(self, wav_file, audio_unique_name):
"""
compute pitch and return a numpy array of pitch values
"""
pitch_file = self.create_pitch_file_path(wav_file, self.cache_path)
pitch_file = self.create_pitch_file_path(audio_unique_name, self.cache_path)
if not os.path.exists(pitch_file):
pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file)
else:
@ -756,17 +805,169 @@ class F0Dataset:
return pitch.astype(np.float32)
def collate_fn(self, batch):
audio_file = [item["audio_file"] for item in batch]
audio_unique_name = [item["audio_unique_name"] for item in batch]
f0s = [item["f0"] for item in batch]
f0_lens = [len(item["f0"]) for item in batch]
f0_lens_max = max(f0_lens)
f0s_torch = torch.LongTensor(len(f0s), f0_lens_max).fill_(self.get_pad_id())
for i, f0_len in enumerate(f0_lens):
f0s_torch[i, :f0_len] = torch.LongTensor(f0s[i])
return {"audio_file": audio_file, "f0": f0s_torch, "f0_lens": f0_lens}
return {"audio_unique_name": audio_unique_name, "f0": f0s_torch, "f0_lens": f0_lens}
def print_logs(self, level: int = 0) -> None:
indent = "\t" * level
print("\n")
print(f"{indent}> F0Dataset ")
print(f"{indent}| > Number of instances : {len(self.samples)}")
class EnergyDataset:
"""Energy Dataset for computing energy from wav files on CPU.
Pre-computes energy values for all the samples at initialization if `cache_path` is set and not already present. It
also computes the mean and std of the energy values if `normalize_energy` is True.
Args:
samples (Union[List[List], List[Dict]]):
List of samples. Each sample is a list or a dict.
ap (AudioProcessor):
AudioProcessor to compute Energy from wav files.
cache_path (str):
Path to cache Energy values. If `cache_path` is already present or None, it skips the pre-computation.
Defaults to None.
precompute_num_workers (int):
Number of workers used for pre-computing the Energy values. Defaults to 0.
normalize_energy (bool):
Whether to normalize energy values by mean and std. Defaults to True.
"""
def __init__(
self,
samples: Union[List[List], List[Dict]],
ap: "AudioProcessor",
verbose=False,
cache_path: str = None,
precompute_num_workers=0,
normalize_energy=True,
):
self.samples = samples
self.ap = ap
self.verbose = verbose
self.cache_path = cache_path
self.normalize_energy = normalize_energy
self.pad_id = 0.0
self.mean = None
self.std = None
if cache_path is not None and not os.path.exists(cache_path):
os.makedirs(cache_path)
self.precompute(precompute_num_workers)
if normalize_energy:
self.load_stats(cache_path)
def __getitem__(self, idx):
item = self.samples[idx]
energy = self.compute_or_load(item["audio_file"], string2filename(item["audio_unique_name"]))
if self.normalize_energy:
assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available"
energy = self.normalize(energy)
return {"audio_unique_name": item["audio_unique_name"], "energy": energy}
def __len__(self):
return len(self.samples)
def precompute(self, num_workers=0):
print("[*] Pre-computing energies...")
with tqdm.tqdm(total=len(self)) as pbar:
batch_size = num_workers if num_workers > 0 else 1
# we do not normalize at preprocessing
normalize_energy = self.normalize_energy
self.normalize_energy = False
dataloader = torch.utils.data.DataLoader(
batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
)
computed_data = []
for batch in dataloader:
energy = batch["energy"]
computed_data.append(e for e in energy)
pbar.update(batch_size)
self.normalize_energy = normalize_energy
if self.normalize_energy:
computed_data = [tensor for batch in computed_data for tensor in batch] # flatten
energy_mean, energy_std = self.compute_energy_stats(computed_data)
energy_stats = {"mean": energy_mean, "std": energy_std}
np.save(os.path.join(self.cache_path, "energy_stats"), energy_stats, allow_pickle=True)
def get_pad_id(self):
return self.pad_id
@staticmethod
def create_energy_file_path(wav_file, cache_path):
file_name = os.path.splitext(os.path.basename(wav_file))[0]
energy_file = os.path.join(cache_path, file_name + "_energy.npy")
return energy_file
@staticmethod
def _compute_and_save_energy(ap, wav_file, energy_file=None):
wav = ap.load_wav(wav_file)
energy = calculate_energy(wav, fft_size=ap.fft_size, hop_length=ap.hop_length, win_length=ap.win_length)
if energy_file:
np.save(energy_file, energy)
return energy
@staticmethod
def compute_energy_stats(energy_vecs):
nonzeros = np.concatenate([v[np.where(v != 0.0)[0]] for v in energy_vecs])
mean, std = np.mean(nonzeros), np.std(nonzeros)
return mean, std
def load_stats(self, cache_path):
stats_path = os.path.join(cache_path, "energy_stats.npy")
stats = np.load(stats_path, allow_pickle=True).item()
self.mean = stats["mean"].astype(np.float32)
self.std = stats["std"].astype(np.float32)
def normalize(self, energy):
zero_idxs = np.where(energy == 0.0)[0]
energy = energy - self.mean
energy = energy / self.std
energy[zero_idxs] = 0.0
return energy
def denormalize(self, energy):
zero_idxs = np.where(energy == 0.0)[0]
energy *= self.std
energy += self.mean
energy[zero_idxs] = 0.0
return energy
def compute_or_load(self, wav_file, audio_unique_name):
"""
compute energy and return a numpy array of energy values
"""
energy_file = self.create_energy_file_path(audio_unique_name, self.cache_path)
if not os.path.exists(energy_file):
energy = self._compute_and_save_energy(self.ap, wav_file, energy_file)
else:
energy = np.load(energy_file)
return energy.astype(np.float32)
def collate_fn(self, batch):
audio_unique_name = [item["audio_unique_name"] for item in batch]
energys = [item["energy"] for item in batch]
energy_lens = [len(item["energy"]) for item in batch]
energy_lens_max = max(energy_lens)
energys_torch = torch.LongTensor(len(energys), energy_lens_max).fill_(self.get_pad_id())
for i, energy_len in enumerate(energy_lens):
energys_torch[i, :energy_len] = torch.LongTensor(energys[i])
return {"audio_unique_name": audio_unique_name, "energy": energys_torch, "energy_lens": energy_lens}
def print_logs(self, level: int = 0) -> None:
indent = "\t" * level
print("\n")
print(f"{indent}> EnergyDataset ")
print(f"{indent}| > Number of instances : {len(self.samples)}")
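A hedged sketch of using `EnergyDataset` on its own to pre-compute and normalize energy values; the `AudioProcessor` arguments, paths, and sample entries are placeholders:

from TTS.tts.datasets.dataset import EnergyDataset  # as defined above
from TTS.utils.audio import AudioProcessor

ap = AudioProcessor(sample_rate=22050, fft_size=1024, hop_length=256, win_length=1024)
samples = [
    {"audio_file": "wavs/LJ001-0001.wav", "audio_unique_name": "ljspeech#wavs/LJ001-0001"},
]
energy_ds = EnergyDataset(samples, ap, cache_path="cache/energy", precompute_num_workers=0)
item = energy_ds[0]
print(item["audio_unique_name"], item["energy"].shape)  # normalized energy contour per frame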

View File

@ -13,8 +13,56 @@ from tqdm import tqdm
########################
def cml_tts(root_path, meta_file, ignored_speakers=None):
"""Normalizes the CML-TTS meta data file to TTS format
https://github.com/freds0/CML-TTS-Dataset/"""
filepath = os.path.join(root_path, meta_file)
# ensure every line has the same number of columns as the first row
with open(filepath, "r", encoding="utf8") as f:
lines = f.readlines()
num_cols = len(lines[0].split("|")) # take the first row as reference
for idx, line in enumerate(lines[1:]):
if len(line.split("|")) != num_cols:
print(f" > Missing column in line {idx + 1} -> {line.strip()}")
# load metadata
metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
assert all(x in metadata.columns for x in ["wav_filename", "transcript"])
client_id = None if "client_id" in metadata.columns else "default"
emotion_name = None if "emotion_name" in metadata.columns else "neutral"
items = []
not_found_counter = 0
for row in metadata.itertuples():
if client_id is None and ignored_speakers is not None and row.client_id in ignored_speakers:
continue
audio_path = os.path.join(root_path, row.wav_filename)
if not os.path.exists(audio_path):
not_found_counter += 1
continue
items.append(
{
"text": row.transcript,
"audio_file": audio_path,
"speaker_name": client_id if client_id is not None else row.client_id,
"emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
"root_path": root_path,
}
)
if not_found_counter > 0:
print(f" | > [!] {not_found_counter} files not found")
return items
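# Hypothetical, self-contained usage sketch for cml_tts() above (file names,
# speaker id and transcript are made up; in a real setup the formatter is
# imported from TTS.tts.datasets.formatters):
import os
import tempfile

root = tempfile.mkdtemp()
os.makedirs(os.path.join(root, "audio"), exist_ok=True)
open(os.path.join(root, "audio", "0001.wav"), "wb").close()  # placeholder wav
with open(os.path.join(root, "train.csv"), "w", encoding="utf8") as f:
    f.write("wav_filename|transcript|client_id\n")
    f.write("audio/0001.wav|ola mundo|spk1\n")

items = cml_tts(root, "train.csv")
print(items[0]["speaker_name"], items[0]["text"])  # -> spk1 ola mundo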
def coqui(root_path, meta_file, ignored_speakers=None):
"""Interal dataset formatter."""
filepath = os.path.join(root_path, meta_file)
# ensure there are 4 columns for every line
with open(filepath, "r", encoding="utf8") as f:
lines = f.readlines()
num_cols = len(lines[0].split("|")) # take the first row as reference
for idx, line in enumerate(lines[1:]):
if len(line.split("|")) != num_cols:
print(f" > Missing column in line {idx + 1} -> {line.strip()}")
# load metadata
metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
assert all(x in metadata.columns for x in ["audio_file", "text"])
speaker_name = None if "speaker_name" in metadata.columns else "coqui"
@ -97,9 +145,9 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None):
meta_files (str): list of meta files to be used in the training. If None, finds all the csv files
recursively. Defaults to None
"""
speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
speaker_regex = re.compile(f"by_book{os.sep}(male|female){os.sep}(?P<speaker_name>[^{os.sep}]+){os.sep}")
if not meta_files:
csv_files = glob(root_path + "/**/metadata.csv", recursive=True)
csv_files = glob(root_path + f"{os.sep}**{os.sep}metadata.csv", recursive=True)
else:
csv_files = meta_files
@ -232,7 +280,7 @@ def css10(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
cols = line.split("|")
wav_file = os.path.join(root_path, cols[0])
text = cols[1]
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
@ -246,7 +294,7 @@ def nancy(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
utt_id = line.split()[1]
text = line[line.find('"') + 1 : line.rfind('"') - 1]
wav_file = os.path.join(root_path, "wavn", utt_id + ".wav")
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
@ -578,3 +626,30 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
text = cols[2].replace(" ", "")
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Korean single-speaker dataset from https://www.kaggle.com/datasets/bryanpark/korean-single-speaker-speech-dataset"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "kss"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, cols[0])
text = cols[2]  # cols[1] has digits (e.g. "6월"), cols[2] the spelled-out form (e.g. "유월", "June")
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def bel_tts_formatter(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "bel_tts"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, cols[0])
text = cols[1]
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items

View File

View File

View File

@ -0,0 +1,35 @@
# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
import os.path
import shutil
import urllib.request
import huggingface_hub
class HubertManager:
@staticmethod
def make_sure_hubert_installed(
download_url: str = "https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", model_path: str = ""
):
if not os.path.isfile(model_path):
print("Downloading HuBERT base model")
urllib.request.urlretrieve(download_url, model_path)
print("Downloaded HuBERT")
return model_path
return None
@staticmethod
def make_sure_tokenizer_installed(
model: str = "quantifier_hubert_base_ls960_14.pth",
repo: str = "GitMylo/bark-voice-cloning",
model_path: str = "",
):
model_dir = os.path.dirname(model_path)
if not os.path.isfile(model_path):
print("Downloading HuBERT custom tokenizer")
huggingface_hub.hf_hub_download(repo, model, local_dir=model_dir, local_dir_use_symlinks=False)
shutil.move(os.path.join(model_dir, model), model_path)
print("Downloaded tokenizer")
return model_path
return None
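# Hypothetical usage of HubertManager above; destination paths are placeholders
# (not the paths configured in the Bark model config), and both calls download
# files on first run:
from pathlib import Path

cache = Path("bark_assets")
cache.mkdir(exist_ok=True)
HubertManager.make_sure_hubert_installed(model_path=str(cache / "hubert_base_ls960.pt"))
HubertManager.make_sure_tokenizer_installed(model_path=str(cache / "tokenizer.pth"))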

View File

@ -0,0 +1,82 @@
"""
Modified HuBERT model without kmeans.
Original author: https://github.com/lucidrains/
Modified by: https://www.github.com/gitmylo/
License: MIT
"""
# Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py
import logging
from pathlib import Path
import torch
from einops import pack, unpack
from torch import nn
from torchaudio.functional import resample
from transformers import HubertModel
def round_down_nearest_multiple(num, divisor):
return num // divisor * divisor
def curtail_to_multiple(t, mult, from_left=False):
data_len = t.shape[-1]
rounded_seq_len = round_down_nearest_multiple(data_len, mult)
seq_slice = slice(None, rounded_seq_len) if not from_left else slice(-rounded_seq_len, None)
return t[..., seq_slice]
def exists(val):
return val is not None
def default(val, d):
return val if exists(val) else d
class CustomHubert(nn.Module):
"""
checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
or you can train your own
"""
def __init__(self, checkpoint_path, target_sample_hz=16000, seq_len_multiple_of=None, output_layer=9, device=None):
super().__init__()
self.target_sample_hz = target_sample_hz
self.seq_len_multiple_of = seq_len_multiple_of
self.output_layer = output_layer
if device is not None:
self.to(device)
self.model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
if device is not None:
self.model.to(device)
self.model.eval()
@property
def groups(self):
return 1
@torch.no_grad()
def forward(self, wav_input, flatten=True, input_sample_hz=None):
device = wav_input.device
if exists(input_sample_hz):
wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz)
if exists(self.seq_len_multiple_of):
wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
outputs = self.model.forward(
wav_input,
output_hidden_states=True,
)
embed = outputs["hidden_states"][self.output_layer]
embed, packed_shape = pack([embed], "* d")
codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)
if flatten:
return codebook_indices
(codebook_indices,) = unpack(codebook_indices, packed_shape, "*")
return codebook_indices
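# Illustrative usage of the CustomHubert wrapper above (the wav path is
# hypothetical; checkpoint_path is unused by this modified wrapper, which pulls
# facebook/hubert-base-ls960 from the Hugging Face Hub on first use):
import torchaudio

wav, sr = torchaudio.load("speaker.wav")        # shape [channels, samples]
hubert = CustomHubert(checkpoint_path=None)
vectors = hubert(wav[0:1], input_sample_hz=sr)  # resampled to 16 kHz internally
print(vectors.shape)                            # [n_frames, 768] for the base model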

View File

@ -0,0 +1,195 @@
"""
Custom tokenizer model.
Author: https://www.github.com/gitmylo/
License: MIT
"""
import json
import os.path
from zipfile import ZipFile
import numpy
import torch
from torch import nn, optim
class HubertTokenizer(nn.Module):
def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
super().__init__()
next_size = input_size
if version == 0:
self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
next_size = hidden_size
if version == 1:
self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
self.intermediate = nn.Linear(hidden_size, 4096)
next_size = 4096
self.fc = nn.Linear(next_size, output_size)
self.softmax = nn.LogSoftmax(dim=1)
self.optimizer: optim.Optimizer = None
self.lossfunc = nn.CrossEntropyLoss()
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
self.version = version
def forward(self, x):
x, _ = self.lstm(x)
if self.version == 1:
x = self.intermediate(x)
x = self.fc(x)
x = self.softmax(x)
return x
@torch.no_grad()
def get_token(self, x):
"""
Used to get the output tokens for a batch of input feature vectors.
:param x: An array with shape (N, input_size) where N is a whole number greater or equal to 1, and input_size is the input size used when creating the model.
:return: An array with shape (N,) where N is the same as N from the input. Every number in the array is a whole number in range 0...output_size - 1 where output_size is the output size used when creating the model.
"""
return torch.argmax(self(x), dim=1)
def prepare_training(self):
self.optimizer = optim.Adam(self.parameters(), 0.001)
def train_step(self, x_train, y_train, log_loss=False):
# y_train = y_train[:-1]
# y_train = y_train[1:]
optimizer = self.optimizer
lossfunc = self.lossfunc
# Zero the gradients
self.zero_grad()
# Forward pass
y_pred = self(x_train)
y_train_len = len(y_train)
y_pred_len = y_pred.shape[0]
if y_train_len > y_pred_len:
diff = y_train_len - y_pred_len
y_train = y_train[diff:]
elif y_train_len < y_pred_len:
diff = y_pred_len - y_train_len
y_pred = y_pred[:-diff, :]
y_train_hot = torch.zeros(len(y_train), self.output_size)
y_train_hot[range(len(y_train)), y_train] = 1
y_train_hot = y_train_hot.to("cuda")
# Calculate the loss
loss = lossfunc(y_pred, y_train_hot)
# Print loss
if log_loss:
print("Loss", loss.item())
# Backward pass
loss.backward()
# Update the weights
optimizer.step()
def save(self, path):
info_path = ".".join(os.path.basename(path).split(".")[:-1]) + "/.info"
torch.save(self.state_dict(), path)
data_from_model = Data(self.input_size, self.hidden_size, self.output_size, self.version)
with ZipFile(path, "a") as model_zip:
model_zip.writestr(info_path, data_from_model.save())
model_zip.close()
@staticmethod
def load_from_checkpoint(path, map_location=None):
old = True
with ZipFile(path) as model_zip:
filesMatch = [file for file in model_zip.namelist() if file.endswith("/.info")]
file = filesMatch[0] if filesMatch else None
if file:
old = False
data_from_model = Data.load(model_zip.read(file).decode("utf-8"))
model_zip.close()
if old:
model = HubertTokenizer()
else:
model = HubertTokenizer(
data_from_model.hidden_size,
data_from_model.input_size,
data_from_model.output_size,
data_from_model.version,
)
model.load_state_dict(torch.load(path, map_location=map_location))
if map_location:
model = model.to(map_location)
return model
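# Hypothetical inference sketch for HubertTokenizer above: map HuBERT hidden
# states of shape (N, 768) to discrete semantic tokens of shape (N,). The
# tokenizer here is untrained and only illustrates shapes (unbatched LSTM input
# needs a reasonably recent torch):
import torch

tok = HubertTokenizer(version=1)
feats = torch.randn(100, 768)        # e.g. the output of CustomHubert above
tokens = tok.get_token(feats)        # LongTensor of shape (100,)
print(tokens.shape, int(tokens.max()) < 10000)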
class Data:
input_size: int
hidden_size: int
output_size: int
version: int
def __init__(self, input_size=768, hidden_size=1024, output_size=10000, version=0):
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
self.version = version
@staticmethod
def load(string):
data = json.loads(string)
return Data(data["input_size"], data["hidden_size"], data["output_size"], data["version"])
def save(self):
data = {
"input_size": self.input_size,
"hidden_size": self.hidden_size,
"output_size": self.output_size,
"version": self.version,
}
return json.dumps(data)
def auto_train(data_path, save_path="model.pth", load_model: str = None, save_epochs=1):
data_x, data_y = [], []
if load_model and os.path.isfile(load_model):
print("Loading model from", load_model)
model_training = HubertTokenizer.load_from_checkpoint(load_model, "cuda")
else:
print("Creating new model.")
model_training = HubertTokenizer(version=1).to("cuda")  # version=1 adds an intermediate linear layer on top of the LSTM
save_path = os.path.join(data_path, save_path)
base_save_path = ".".join(save_path.split(".")[:-1])
sem_string = "_semantic.npy"
feat_string = "_semantic_features.npy"
ready = os.path.join(data_path, "ready")
for input_file in os.listdir(ready):
full_path = os.path.join(ready, input_file)
if input_file.endswith(sem_string):
data_y.append(numpy.load(full_path))
elif input_file.endswith(feat_string):
data_x.append(numpy.load(full_path))
model_training.prepare_training()
epoch = 1
while 1:
for _ in range(save_epochs):
j = 0
for x, y in zip(data_x, data_y):
model_training.train_step(
torch.tensor(x).to("cuda"), torch.tensor(y).to("cuda"), j % 50 == 0
) # Print loss every 50 steps
j += 1
save_p = save_path
save_p_2 = f"{base_save_path}_epoch_{epoch}.pth"
model_training.save(save_p)
model_training.save(save_p_2)
print(f"Epoch {epoch} completed")
epoch += 1
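# Illustrative directory layout assumed by auto_train() above (file names are
# hypothetical; only the suffixes matter):
#
#   <data_path>/ready/utt0001_semantic_features.npy  -> HuBERT features, shape (T, 768)
#   <data_path>/ready/utt0001_semantic.npy           -> target semantic tokens, shape (T,)
#   ...
#
# auto_train("<data_path>") then trains indefinitely on CUDA, writing model.pth
# and model_epoch_<n>.pth checkpoints inside <data_path>.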

View File

@ -0,0 +1,606 @@
import logging
import os
import re
from glob import glob
from typing import Dict, List
import librosa
import numpy as np
import torch
import torchaudio
import tqdm
from encodec.utils import convert_audio
from scipy.special import softmax
from torch.nn import functional as F
from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager
from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert
from TTS.tts.layers.bark.hubert.tokenizer import HubertTokenizer
from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode
logger = logging.getLogger(__name__)
def _tokenize(tokenizer, text):
return tokenizer.encode(text, add_special_tokens=False)
def _detokenize(tokenizer, enc_text):
return tokenizer.decode(enc_text)
def _normalize_whitespace(text):
return re.sub(r"\s+", " ", text).strip()
def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value
dirs = extra_voice_dirs
voices: Dict[str, List[str]] = {}
for d in dirs:
subs = os.listdir(d)
for sub in subs:
subj = os.path.join(d, sub)
if os.path.isdir(subj):
voices[sub] = list(glob(f"{subj}/*.npz"))
# fetch audio files if no npz files are found
if len(voices[sub]) == 0:
voices[sub] = list(glob(f"{subj}/*.wav")) + list(glob(f"{subj}/*.mp3"))
return voices
def load_npz(npz_file):
x_history = np.load(npz_file)
semantic = x_history["semantic_prompt"]
coarse = x_history["coarse_prompt"]
fine = x_history["fine_prompt"]
return semantic, coarse, fine
def load_voice(model, voice: str, extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value
if voice == "random":
return None, None, None
voices = get_voices(extra_voice_dirs)
try:
paths = voices[voice]
except KeyError as e:
raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}") from e
# bark only uses a single sample for cloning
if len(paths) > 1:
raise ValueError(f"Voice {voice} has multiple paths: {paths}")
if len(paths) == 1 and paths[0].endswith(".npz"):
return load_npz(paths[0])
audio_path = paths[0]
# replace the file extension with .npz
output_path = os.path.splitext(audio_path)[0] + ".npz"
generate_voice(audio=audio_path, model=model, output_path=output_path)
return load_voice(model, voice, extra_voice_dirs)
def zero_crossing_rate(audio, frame_length=1024, hop_length=512):
zero_crossings = np.sum(np.abs(np.diff(np.sign(audio))) / 2)
total_frames = 1 + int((len(audio) - frame_length) / hop_length)
return zero_crossings / total_frames
def compute_spectral_contrast(audio_data, sample_rate, n_bands=6, fmin=200.0):
spectral_contrast = librosa.feature.spectral_contrast(y=audio_data, sr=sample_rate, n_bands=n_bands, fmin=fmin)
return np.mean(spectral_contrast)
def compute_average_bass_energy(audio_data, sample_rate, max_bass_freq=250):
stft = librosa.stft(audio_data)
power_spectrogram = np.abs(stft) ** 2
frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=stft.shape[0])
bass_mask = frequencies <= max_bass_freq
bass_energy = power_spectrogram[np.ix_(bass_mask, np.arange(power_spectrogram.shape[1]))].mean()
return bass_energy
def generate_voice(
audio,
model,
output_path,
):
"""Generate a new voice from a given audio and text prompt.
Args:
audio (np.ndarray): The audio to use as a base for the new voice.
text (str): Transcription of the audio you are clonning.
model (BarkModel): The BarkModel to use for generating the new voice.
output_path (str): The path to save the generated voice to.
"""
if isinstance(audio, str):
audio, sr = torchaudio.load(audio)
audio = convert_audio(audio, sr, model.config.sample_rate, model.encodec.channels)
audio = audio.unsqueeze(0).to(model.device)
with torch.no_grad():
encoded_frames = model.encodec.encode(audio)
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]
# move codes to cpu
codes = codes.cpu().numpy()
# generate semantic tokens
# Load the HuBERT model
hubert_manager = HubertManager()
# hubert_manager.make_sure_hubert_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert"])
hubert_manager.make_sure_tokenizer_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"])
hubert_model = CustomHubert(checkpoint_path=model.config.LOCAL_MODEL_PATHS["hubert"]).to(model.device)
# Load the CustomTokenizer model
tokenizer = HubertTokenizer.load_from_checkpoint(
model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"], map_location=model.device
)
# semantic_tokens = model.text_to_semantic(
# text, max_gen_duration_s=seconds, top_k=50, top_p=0.95, temp=0.7
# ) # not 100%
semantic_vectors = hubert_model.forward(audio[0], input_sample_hz=model.config.sample_rate)
semantic_tokens = tokenizer.get_token(semantic_vectors)
semantic_tokens = semantic_tokens.cpu().numpy()
np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)
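# Hypothetical usage of generate_voice() above, assuming `model` is a loaded
# Bark instance exposing .config, .encodec and .device as the function expects:
#
#   generate_voice(audio="reference.wav", model=model, output_path="speaker.npz")
#
# The resulting .npz stores semantic_prompt / coarse_prompt / fine_prompt and
# can then be consumed by load_voice() / load_npz() above.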
def generate_text_semantic(
text,
model,
history_prompt=None,
temp=0.7,
top_k=None,
top_p=None,
silent=False,
min_eos_p=0.2,
max_gen_duration_s=None,
allow_early_stop=True,
base=None,
use_kv_caching=True,
**kwargs, # pylint: disable=unused-argument
):
"""Generate semantic tokens from text.
Args:
text (str): The text to generate semantic tokens from.
model (BarkModel): The BarkModel to use for generating the semantic tokens.
history_prompt (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a prompt for the generation.
temp (float): The temperature to use for the generation.
top_k (int): The number of top tokens to consider for the generation.
top_p (float): The cumulative probability to consider for the generation.
silent (bool): Whether to silence the tqdm progress bar.
min_eos_p (float): The minimum probability to consider for the end of sentence token.
max_gen_duration_s (float): The maximum duration in seconds to generate for.
allow_early_stop (bool): Whether to allow the generation to stop early.
base (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a base for the generation.
use_kv_caching (bool): Whether to use key-value caching for the generation.
**kwargs: Additional keyword arguments. They are ignored.
Returns:
np.ndarray: The generated semantic tokens.
"""
assert isinstance(text, str)
text = _normalize_whitespace(text)
assert len(text.strip()) > 0
if all(v is not None for v in history_prompt) or base is not None:
if history_prompt is not None:
semantic_history = history_prompt[0]
if base is not None:
semantic_history = base[0]
assert (
isinstance(semantic_history, np.ndarray)
and len(semantic_history.shape) == 1
and len(semantic_history) > 0
and semantic_history.min() >= 0
and semantic_history.max() <= model.config.SEMANTIC_VOCAB_SIZE - 1
)
else:
semantic_history = None
encoded_text = np.array(_tokenize(model.tokenizer, text)) + model.config.TEXT_ENCODING_OFFSET
if len(encoded_text) > 256:
p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1)
logger.warning(f"warning, text too long, lopping of last {p}%")
encoded_text = encoded_text[:256]
encoded_text = np.pad(
encoded_text,
(0, 256 - len(encoded_text)),
constant_values=model.config.TEXT_PAD_TOKEN,
mode="constant",
)
if semantic_history is not None:
semantic_history = semantic_history.astype(np.int64)
# lop off if history is too long, pad if needed
semantic_history = semantic_history[-256:]
semantic_history = np.pad(
semantic_history,
(0, 256 - len(semantic_history)),
constant_values=model.config.SEMANTIC_PAD_TOKEN,
mode="constant",
)
else:
semantic_history = np.array([model.config.SEMANTIC_PAD_TOKEN] * 256)
x = torch.from_numpy(
np.hstack([encoded_text, semantic_history, np.array([model.config.SEMANTIC_INFER_TOKEN])]).astype(np.int64)
)[None]
assert x.shape[1] == 256 + 256 + 1
with inference_mode():
x = x.to(model.device)
n_tot_steps = 768
# custom tqdm updates since we don't know when eos will occur
pbar = tqdm.tqdm(disable=silent, total=100)
pbar_state = 0
tot_generated_duration_s = 0
kv_cache = None
for n in range(n_tot_steps):
if use_kv_caching and kv_cache is not None:
x_input = x[:, [-1]]
else:
x_input = x
logits, kv_cache = model.semantic_model(
x_input, merge_context=True, use_cache=use_kv_caching, past_kv=kv_cache
)
relevant_logits = logits[0, 0, : model.config.SEMANTIC_VOCAB_SIZE]
if allow_early_stop:
relevant_logits = torch.hstack(
(relevant_logits, logits[0, 0, [model.config.SEMANTIC_PAD_TOKEN]])
) # eos
if top_p is not None:
# faster to convert to numpy
logits_device = relevant_logits.device
logits_dtype = relevant_logits.type()
relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
sorted_indices = np.argsort(relevant_logits)[::-1]
sorted_logits = relevant_logits[sorted_indices]
cumulative_probs = np.cumsum(softmax(sorted_logits))
sorted_indices_to_remove = cumulative_probs > top_p
sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
sorted_indices_to_remove[0] = False
relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
relevant_logits = torch.from_numpy(relevant_logits)
relevant_logits = relevant_logits.to(logits_device).type(logits_dtype)
if top_k is not None:
v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
relevant_logits[relevant_logits < v[-1]] = -float("Inf")
probs = torch.softmax(relevant_logits / temp, dim=-1)
item_next = torch.multinomial(probs, num_samples=1)
if allow_early_stop and (
item_next == model.config.SEMANTIC_VOCAB_SIZE or (min_eos_p is not None and probs[-1] >= min_eos_p)
):
# eos found, so break
pbar.update(100 - pbar_state)
break
x = torch.cat((x, item_next[None]), dim=1)
tot_generated_duration_s += 1 / model.config.SEMANTIC_RATE_HZ
if max_gen_duration_s is not None and tot_generated_duration_s > max_gen_duration_s:
pbar.update(100 - pbar_state)
break
if n == n_tot_steps - 1:
pbar.update(100 - pbar_state)
break
del logits, relevant_logits, probs, item_next
req_pbar_state = np.min([100, int(round(100 * n / n_tot_steps))])
if req_pbar_state > pbar_state:
pbar.update(req_pbar_state - pbar_state)
pbar_state = req_pbar_state
pbar.close()
out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
assert all(out >= 0) and all(out < model.config.SEMANTIC_VOCAB_SIZE)
clear_cuda_cache()
return out
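# Standalone sketch of the nucleus (top-p) filtering used inside
# generate_text_semantic() above and generate_coarse() below: keep the smallest
# set of tokens whose cumulative probability exceeds top_p and mask the rest.
import numpy as np
from scipy.special import softmax

def top_p_filter(logits, top_p=0.95):
    sorted_indices = np.argsort(logits)[::-1]
    cumulative_probs = np.cumsum(softmax(logits[sorted_indices]))
    to_remove = cumulative_probs > top_p
    to_remove[1:] = to_remove[:-1].copy()  # shift right so the top token is always kept
    to_remove[0] = False
    out = logits.copy()
    out[sorted_indices[to_remove]] = -np.inf
    return out

print(top_p_filter(np.array([2.0, 1.0, 0.5, -1.0]), top_p=0.9))  # last logit -> -inf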
def _flatten_codebooks(arr, offset_size):
assert len(arr.shape) == 2
arr = arr.copy()
if offset_size is not None:
for n in range(1, arr.shape[0]):
arr[n, :] += offset_size * n
flat_arr = arr.ravel("F")
return flat_arr
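# Worked example for _flatten_codebooks() above: each codebook row gets its own
# offset and the [n_q, T] array is flattened column-major, interleaving the
# codebooks frame by frame.
import numpy as np

arr = np.array([[1, 2, 3],    # codebook 0
                [4, 5, 6]])   # codebook 1
print(_flatten_codebooks(arr, offset_size=10))  # -> [ 1 14  2 15  3 16]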
def generate_coarse(
x_semantic,
model,
history_prompt=None,
temp=0.7,
top_k=None,
top_p=None,
silent=False,
max_coarse_history=630, # min 60 (faster), max 630 (more context)
sliding_window_len=60,
base=None,
use_kv_caching=True,
):
"""Generate coarse audio codes from semantic tokens.
Args:
x_semantic (np.ndarray): The semantic tokens to generate coarse audio codes from.
model (BarkModel): The BarkModel to use for generating the coarse audio codes.
history_prompt (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a prompt for the generation.
temp (float): The temperature to use for the generation.
top_k (int): The number of top tokens to consider for the generation.
top_p (float): The cumulative probability to consider for the generation.
silent (bool): Whether to silence the tqdm progress bar.
max_coarse_history (int): The maximum number of coarse audio codes to use as history.
sliding_window_len (int): The length of the sliding window to use for the generation.
base (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a base for the generation.
use_kv_caching (bool): Whether to use key-value caching for the generation.
Returns:
np.ndarray: The generated coarse audio codes.
"""
assert (
isinstance(x_semantic, np.ndarray)
and len(x_semantic.shape) == 1
and len(x_semantic) > 0
and x_semantic.min() >= 0
and x_semantic.max() <= model.config.SEMANTIC_VOCAB_SIZE - 1
)
assert 60 <= max_coarse_history <= 630
assert max_coarse_history + sliding_window_len <= 1024 - 256
semantic_to_coarse_ratio = (
model.config.COARSE_RATE_HZ / model.config.SEMANTIC_RATE_HZ * model.config.N_COARSE_CODEBOOKS
)
max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
if all(v is not None for v in history_prompt) or base is not None:
if history_prompt is not None:
x_history = history_prompt
x_semantic_history = x_history[0]
x_coarse_history = x_history[1]
if base is not None:
x_semantic_history = base[0]
x_coarse_history = base[1]
assert (
isinstance(x_semantic_history, np.ndarray)
and len(x_semantic_history.shape) == 1
and len(x_semantic_history) > 0
and x_semantic_history.min() >= 0
and x_semantic_history.max() <= model.config.SEMANTIC_VOCAB_SIZE - 1
and isinstance(x_coarse_history, np.ndarray)
and len(x_coarse_history.shape) == 2
and x_coarse_history.shape[0] == model.config.N_COARSE_CODEBOOKS
and x_coarse_history.shape[-1] >= 0
and x_coarse_history.min() >= 0
and x_coarse_history.max() <= model.config.CODEBOOK_SIZE - 1
and (
round(x_coarse_history.shape[-1] / len(x_semantic_history), 1)
== round(semantic_to_coarse_ratio / model.config.N_COARSE_CODEBOOKS, 1)
)
)
x_coarse_history = (
_flatten_codebooks(x_coarse_history, model.config.CODEBOOK_SIZE) + model.config.SEMANTIC_VOCAB_SIZE
)
# trim histories correctly
n_semantic_hist_provided = np.min(
[
max_semantic_history,
len(x_semantic_history) - len(x_semantic_history) % 2,
int(np.floor(len(x_coarse_history) / semantic_to_coarse_ratio)),
]
)
n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio))
x_semantic_history = x_semantic_history[-n_semantic_hist_provided:].astype(np.int32)
x_coarse_history = x_coarse_history[-n_coarse_hist_provided:].astype(np.int32)
# TODO: bit of a hack for time alignment (sounds better)
x_coarse_history = x_coarse_history[:-2]
else:
x_semantic_history = np.array([], dtype=np.int32)
x_coarse_history = np.array([], dtype=np.int32)
# start loop
n_steps = int(
round(
np.floor(len(x_semantic) * semantic_to_coarse_ratio / model.config.N_COARSE_CODEBOOKS)
* model.config.N_COARSE_CODEBOOKS
)
)
assert n_steps > 0 and n_steps % model.config.N_COARSE_CODEBOOKS == 0
x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
x_coarse = x_coarse_history.astype(np.int32)
base_semantic_idx = len(x_semantic_history)
with inference_mode():
x_semantic_in = torch.from_numpy(x_semantic)[None].to(model.device)
x_coarse_in = torch.from_numpy(x_coarse)[None].to(model.device)
n_window_steps = int(np.ceil(n_steps / sliding_window_len))
n_step = 0
for _ in tqdm.tqdm(range(n_window_steps), total=n_window_steps, disable=silent):
semantic_idx = base_semantic_idx + int(round(n_step / semantic_to_coarse_ratio))
# pad from right side
x_in = x_semantic_in[:, np.max([0, semantic_idx - max_semantic_history]) :]
x_in = x_in[:, :256]
x_in = F.pad(
x_in,
(0, 256 - x_in.shape[-1]),
"constant",
model.config.COARSE_SEMANTIC_PAD_TOKEN,
)
x_in = torch.hstack(
[
x_in,
torch.tensor([model.config.COARSE_INFER_TOKEN])[None].to(model.device),
x_coarse_in[:, -max_coarse_history:],
]
)
kv_cache = None
for _ in range(sliding_window_len):
if n_step >= n_steps:
continue
is_major_step = n_step % model.config.N_COARSE_CODEBOOKS == 0
if use_kv_caching and kv_cache is not None:
x_input = x_in[:, [-1]]
else:
x_input = x_in
logits, kv_cache = model.coarse_model(x_input, use_cache=use_kv_caching, past_kv=kv_cache)
logit_start_idx = (
model.config.SEMANTIC_VOCAB_SIZE + (1 - int(is_major_step)) * model.config.CODEBOOK_SIZE
)
logit_end_idx = model.config.SEMANTIC_VOCAB_SIZE + (2 - int(is_major_step)) * model.config.CODEBOOK_SIZE
relevant_logits = logits[0, 0, logit_start_idx:logit_end_idx]
if top_p is not None:
# faster to convert to numpy
logits_device = relevant_logits.device
logits_dtype = relevant_logits.type()
relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
sorted_indices = np.argsort(relevant_logits)[::-1]
sorted_logits = relevant_logits[sorted_indices]
cumulative_probs = np.cumsum(softmax(sorted_logits))
sorted_indices_to_remove = cumulative_probs > top_p
sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
sorted_indices_to_remove[0] = False
relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
relevant_logits = torch.from_numpy(relevant_logits)
relevant_logits = relevant_logits.to(logits_device).type(logits_dtype)
if top_k is not None:
v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
relevant_logits[relevant_logits < v[-1]] = -float("Inf")
probs = torch.nn.functional.softmax(relevant_logits / temp, dim=-1)
item_next = torch.multinomial(probs, num_samples=1)
item_next += logit_start_idx
x_coarse_in = torch.cat((x_coarse_in, item_next[None]), dim=1)
x_in = torch.cat((x_in, item_next[None]), dim=1)
del logits, relevant_logits, probs, item_next
n_step += 1
del x_in
del x_semantic_in
gen_coarse_arr = x_coarse_in.detach().cpu().numpy().squeeze()[len(x_coarse_history) :]
del x_coarse_in
assert len(gen_coarse_arr) == n_steps
gen_coarse_audio_arr = (
gen_coarse_arr.reshape(-1, model.config.N_COARSE_CODEBOOKS).T - model.config.SEMANTIC_VOCAB_SIZE
)
for n in range(1, model.config.N_COARSE_CODEBOOKS):
gen_coarse_audio_arr[n, :] -= n * model.config.CODEBOOK_SIZE
clear_cuda_cache()
return gen_coarse_audio_arr
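# Rough numbers, assuming Bark's usual configuration (SEMANTIC_RATE_HZ ~= 49.9,
# COARSE_RATE_HZ = 75, N_COARSE_CODEBOOKS = 2; these constants are not shown in
# this diff):
#
#   semantic_to_coarse_ratio ~= 75 / 49.9 * 2 ~= 3.0
#
# so generate_coarse() emits roughly three coarse tokens per semantic token, in
# windows of `sliding_window_len` steps while keeping at most `max_coarse_history`
# coarse tokens of history inside the 1024-token context.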
def generate_fine(
x_coarse_gen,
model,
history_prompt=None,
temp=0.5,
silent=True,
base=None,
):
"""Generate full audio codes from coarse audio codes.
Args:
x_coarse_gen (np.ndarray): The coarse audio codes to generate full audio codes from.
model (BarkModel): The BarkModel to use for generating the full audio codes.
history_prompt (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a prompt for the generation.
temp (float): The temperature to use for the generation.
silent (bool): Whether to silence the tqdm progress bar.
base (tuple): A tuple of (semantic_history, coarse_history, fine_history) to use as a base for the generation.
Returns:
np.ndarray: The generated full audio codes.
"""
assert (
isinstance(x_coarse_gen, np.ndarray)
and len(x_coarse_gen.shape) == 2
and 1 <= x_coarse_gen.shape[0] <= model.config.N_FINE_CODEBOOKS - 1
and x_coarse_gen.shape[1] > 0
and x_coarse_gen.min() >= 0
and x_coarse_gen.max() <= model.config.CODEBOOK_SIZE - 1
)
if all(v is not None for v in history_prompt) or base is not None:
if history_prompt is not None:
x_fine_history = history_prompt[2]
if base is not None:
x_fine_history = base[2]
assert (
isinstance(x_fine_history, np.ndarray)
and len(x_fine_history.shape) == 2
and x_fine_history.shape[0] == model.config.N_FINE_CODEBOOKS
and x_fine_history.shape[1] >= 0
and x_fine_history.min() >= 0
and x_fine_history.max() <= model.config.CODEBOOK_SIZE - 1
)
else:
x_fine_history = None
n_coarse = x_coarse_gen.shape[0]
# make input arr
in_arr = np.vstack(
[
x_coarse_gen,
np.zeros((model.config.N_FINE_CODEBOOKS - n_coarse, x_coarse_gen.shape[1]))
+ model.config.CODEBOOK_SIZE, # padding
]
).astype(np.int32)
# prepend history if available (max 512)
if x_fine_history is not None:
x_fine_history = x_fine_history.astype(np.int32)
in_arr = np.hstack(
[
x_fine_history[:, -512:].astype(np.int32),
in_arr,
]
)
n_history = x_fine_history[:, -512:].shape[1]
else:
n_history = 0
n_remove_from_end = 0
# need to pad if too short (since non-causal model)
if in_arr.shape[1] < 1024:
n_remove_from_end = 1024 - in_arr.shape[1]
in_arr = np.hstack(
[
in_arr,
np.zeros((model.config.N_FINE_CODEBOOKS, n_remove_from_end), dtype=np.int32)
+ model.config.CODEBOOK_SIZE,
]
)
# we can be lazy about fractional loop and just keep overwriting codebooks
n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1
with inference_mode():
in_arr = torch.tensor(in_arr.T).to(model.device)
for n in tqdm.tqdm(range(n_loops), disable=silent):
start_idx = np.min([n * 512, in_arr.shape[0] - 1024])
start_fill_idx = np.min([n_history + n * 512, in_arr.shape[0] - 512])
rel_start_fill_idx = start_fill_idx - start_idx
in_buffer = in_arr[start_idx : start_idx + 1024, :][None]
for nn in range(n_coarse, model.config.N_FINE_CODEBOOKS):
logits = model.fine_model(nn, in_buffer)
if temp is None:
relevant_logits = logits[0, rel_start_fill_idx:, : model.config.CODEBOOK_SIZE]
codebook_preds = torch.argmax(relevant_logits, -1)
else:
relevant_logits = logits[0, :, : model.config.CODEBOOK_SIZE] / temp
probs = F.softmax(relevant_logits, dim=-1)
codebook_preds = torch.hstack(
[torch.multinomial(probs[n], num_samples=1) for n in range(rel_start_fill_idx, 1024)]
)
in_buffer[0, rel_start_fill_idx:, nn] = codebook_preds
del logits, codebook_preds
# transfer over info into model_in and convert to numpy
for nn in range(n_coarse, model.config.N_FINE_CODEBOOKS):
in_arr[start_fill_idx : start_fill_idx + (1024 - rel_start_fill_idx), nn] = in_buffer[
0, rel_start_fill_idx:, nn
]
del in_buffer
gen_fine_arr = in_arr.detach().cpu().numpy().squeeze().T
del in_arr
gen_fine_arr = gen_fine_arr[:, n_history:]
if n_remove_from_end > 0:
gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end]
assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1]
clear_cuda_cache()
return gen_fine_arr
def codec_decode(fine_tokens, model):
"""Turn quantized audio codes into audio array using encodec."""
arr = torch.from_numpy(fine_tokens)[None]
arr = arr.to(model.device)
arr = arr.transpose(0, 1)
emb = model.encodec.quantizer.decode(arr)
out = model.encodec.decoder(emb)
audio_arr = out.detach().cpu().numpy().squeeze()
return audio_arr
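# End-to-end sketch (with a hypothetical loaded `model`, as assumed by the
# functions above) of how these pieces compose for voice-prompted synthesis:
#
#   history  = load_voice(model, "my_speaker", extra_voice_dirs=["voices/"])  # (semantic, coarse, fine)
#   semantic = generate_text_semantic("Hello world.", model, history_prompt=history)
#   coarse   = generate_coarse(semantic, model, history_prompt=history)
#   fine     = generate_fine(coarse, model, history_prompt=history)
#   audio    = codec_decode(fine, model)  # numpy waveform at model.config.sample_rate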

View File

@ -0,0 +1,160 @@
import contextlib
import functools
import hashlib
import logging
import os
import requests
import torch
import tqdm
from TTS.tts.layers.bark.model import GPT, GPTConfig
from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig
if (
torch.cuda.is_available()
and hasattr(torch.cuda, "amp")
and hasattr(torch.cuda.amp, "autocast")
and torch.cuda.is_bf16_supported()
):
autocast = functools.partial(torch.cuda.amp.autocast, dtype=torch.bfloat16)
else:
@contextlib.contextmanager
def autocast():
yield
# hold models in global scope to lazy load
logger = logging.getLogger(__name__)
if not hasattr(torch.nn.functional, "scaled_dot_product_attention"):
logger.warning(
"torch version does not support flash attention. You will get significantly faster"
+ " inference speed by upgrade torch to newest version / nightly."
)
def _md5(fname):
hash_md5 = hashlib.md5()
with open(fname, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def _download(from_s3_path, to_local_path, CACHE_DIR):
os.makedirs(CACHE_DIR, exist_ok=True)
response = requests.get(from_s3_path, stream=True)
total_size_in_bytes = int(response.headers.get("content-length", 0))
block_size = 1024 # 1 Kibibyte
progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
with open(to_local_path, "wb") as file:
for data in response.iter_content(block_size):
progress_bar.update(len(data))
file.write(data)
progress_bar.close()
if total_size_in_bytes not in [0, progress_bar.n]:
raise ValueError("ERROR, something went wrong")
class InferenceContext:
def __init__(self, benchmark=False):
# we can't expect inputs to be the same length, so disable benchmarking by default
self._chosen_cudnn_benchmark = benchmark
self._cudnn_benchmark = None
def __enter__(self):
self._cudnn_benchmark = torch.backends.cudnn.benchmark
torch.backends.cudnn.benchmark = self._chosen_cudnn_benchmark
def __exit__(self, exc_type, exc_value, exc_traceback):
torch.backends.cudnn.benchmark = self._cudnn_benchmark
if torch.cuda.is_available():
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
@contextlib.contextmanager
def inference_mode():
with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
yield
def clear_cuda_cache():
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
def load_model(ckpt_path, device, config, model_type="text"):
logger.info(f"loading {model_type} model from {ckpt_path}...")
if device == "cpu":
logger.warning("No GPU being used. Careful, Inference might be extremely slow!")
if model_type == "text":
ConfigClass = GPTConfig
ModelClass = GPT
elif model_type == "coarse":
ConfigClass = GPTConfig
ModelClass = GPT
elif model_type == "fine":
ConfigClass = FineGPTConfig
ModelClass = FineGPT
else:
raise NotImplementedError()
if (
not config.USE_SMALLER_MODELS
and os.path.exists(ckpt_path)
and _md5(ckpt_path) != config.REMOTE_MODEL_PATHS[model_type]["checksum"]
):
logger.warning(f"found outdated {model_type} model, removing...")
os.remove(ckpt_path)
if not os.path.exists(ckpt_path):
logger.info(f"{model_type} model not found, downloading...")
_download(config.REMOTE_MODEL_PATHS[model_type]["path"], ckpt_path, config.CACHE_DIR)
checkpoint = torch.load(ckpt_path, map_location=device)
# this is a hack
model_args = checkpoint["model_args"]
if "input_vocab_size" not in model_args:
model_args["input_vocab_size"] = model_args["vocab_size"]
model_args["output_vocab_size"] = model_args["vocab_size"]
del model_args["vocab_size"]
gptconf = ConfigClass(**checkpoint["model_args"])
if model_type == "text":
config.semantic_config = gptconf
elif model_type == "coarse":
config.coarse_config = gptconf
elif model_type == "fine":
config.fine_config = gptconf
model = ModelClass(gptconf)
state_dict = checkpoint["model"]
# fixup checkpoint
unwanted_prefix = "_orig_mod."
for k, _ in list(state_dict.items()):
if k.startswith(unwanted_prefix):
state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())
extra_keys = set(k for k in extra_keys if not k.endswith(".attn.bias"))
missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
missing_keys = set(k for k in missing_keys if not k.endswith(".attn.bias"))
if len(extra_keys) != 0:
raise ValueError(f"extra keys found: {extra_keys}")
if len(missing_keys) != 0:
raise ValueError(f"missing keys: {missing_keys}")
model.load_state_dict(state_dict, strict=False)
n_params = model.get_num_params()
val_loss = checkpoint["best_val_loss"].item()
logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss")
model.eval()
model.to(device)
del checkpoint, state_dict
clear_cuda_cache()
return model, config

View File

@ -0,0 +1,233 @@
"""
Much of this code is adapted from Andrej Karpathy's NanoGPT
(https://github.com/karpathy/nanoGPT)
"""
import math
from dataclasses import dataclass
import torch
from coqpit import Coqpit
from torch import nn
from torch.nn import functional as F
class LayerNorm(nn.Module):
"""LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""
def __init__(self, ndim, bias):
super().__init__()
self.weight = nn.Parameter(torch.ones(ndim))
self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
def forward(self, x):
return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)
class CausalSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
assert config.n_embd % config.n_head == 0
# key, query, value projections for all heads, but in a batch
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
# output projection
self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
# regularization
self.attn_dropout = nn.Dropout(config.dropout)
self.resid_dropout = nn.Dropout(config.dropout)
self.n_head = config.n_head
self.n_embd = config.n_embd
self.dropout = config.dropout
# flash attention makes GPU go brrrrr but support is only in PyTorch nightly and still a bit scary
self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention")
if not self.flash:
# print("WARNING: using slow attention. Flash Attention atm needs PyTorch nightly and dropout=0.0")
# causal mask to ensure that attention is only applied to the left in the input sequence
self.register_buffer(
"bias",
torch.tril(torch.ones(config.block_size, config.block_size)).view(
1, 1, config.block_size, config.block_size
),
)
def forward(self, x, past_kv=None, use_cache=False):
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
if past_kv is not None:
past_key = past_kv[0]
past_value = past_kv[1]
k = torch.cat((past_key, k), dim=-2)
v = torch.cat((past_value, v), dim=-2)
FULL_T = k.shape[-2]
if use_cache is True:
present = (k, v)
else:
present = None
# causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
if self.flash:
# efficient attention using Flash Attention CUDA kernels
if past_kv is not None:
# When `past_kv` is provided, we're doing incremental decoding and `q.shape[2] == 1`: q only contains
# the query for the last token. scaled_dot_product_attention interprets this as the first token in the
# sequence, so if is_causal=True it will mask out all attention from it. This is not what we want, so
# to work around this we set is_causal=False.
is_causal = False
else:
is_causal = True
# efficient attention using Flash Attention CUDA kernels
y = torch.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout, is_causal=is_causal)
else:
# manual implementation of attention
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
att = att.masked_fill(self.bias[:, :, FULL_T - T : FULL_T, :FULL_T] == 0, float("-inf"))
att = F.softmax(att, dim=-1)
att = self.attn_dropout(att)
y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
# output projection
y = self.resid_dropout(self.c_proj(y))
return (y, present)
class MLP(nn.Module):
def __init__(self, config):
super().__init__()
self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
self.dropout = nn.Dropout(config.dropout)
self.gelu = nn.GELU()
def forward(self, x):
x = self.c_fc(x)
x = self.gelu(x)
x = self.c_proj(x)
x = self.dropout(x)
return x
class Block(nn.Module):
def __init__(self, config, layer_idx):
super().__init__()
self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
self.attn = CausalSelfAttention(config)
self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
self.mlp = MLP(config)
self.layer_idx = layer_idx
def forward(self, x, past_kv=None, use_cache=False):
attn_output, prev_kvs = self.attn(self.ln_1(x), past_kv=past_kv, use_cache=use_cache)
x = x + attn_output
x = x + self.mlp(self.ln_2(x))
return (x, prev_kvs)
@dataclass
class GPTConfig(Coqpit):
block_size: int = 1024
input_vocab_size: int = 10_048
output_vocab_size: int = 10_048
n_layer: int = 12
n_head: int = 12
n_embd: int = 768
dropout: float = 0.0
bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
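# Shape sketch (hypothetical, tiny GPTConfig values) of the KV cache returned by
# CausalSelfAttention.forward() above during incremental decoding:
import torch

cfg = GPTConfig(n_layer=2, n_head=2, n_embd=8, block_size=16,
                input_vocab_size=32, output_vocab_size=32, dropout=0.0)
attn = CausalSelfAttention(cfg)
x = torch.randn(1, 5, cfg.n_embd)
y, (k, v) = attn(x, use_cache=True)                 # prime the cache with 5 tokens
print(k.shape)                                      # torch.Size([1, 2, 5, 4])
y, (k, v) = attn(torch.randn(1, 1, cfg.n_embd), past_kv=(k, v), use_cache=True)
print(k.shape)                                      # torch.Size([1, 2, 6, 4])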
class GPT(nn.Module):
def __init__(self, config):
super().__init__()
assert config.input_vocab_size is not None
assert config.output_vocab_size is not None
assert config.block_size is not None
self.config = config
self.transformer = nn.ModuleDict(
dict(
wte=nn.Embedding(config.input_vocab_size, config.n_embd),
wpe=nn.Embedding(config.block_size, config.n_embd),
drop=nn.Dropout(config.dropout),
h=nn.ModuleList([Block(config, idx) for idx in range(config.n_layer)]),
ln_f=LayerNorm(config.n_embd, bias=config.bias),
)
)
self.lm_head = nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
def get_num_params(self, non_embedding=True):
"""
Return the number of parameters in the model.
For non-embedding count (default), the position embeddings get subtracted.
The token embeddings would too, except due to the parameter sharing these
params are actually used as weights in the final layer, so we include them.
"""
n_params = sum(p.numel() for p in self.parameters())
if non_embedding:
n_params -= self.transformer.wte.weight.numel()
n_params -= self.transformer.wpe.weight.numel()
return n_params
def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use_cache=False):
device = idx.device
_, t = idx.size()
if past_kv is not None:
assert t == 1
tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
else:
if merge_context:
assert idx.shape[1] >= 256 + 256 + 1
t = idx.shape[1] - 256
else:
assert (
t <= self.config.block_size
), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
# forward the GPT model itself
if merge_context:
tok_emb = torch.cat(
[
self.transformer.wte(idx[:, :256]) + self.transformer.wte(idx[:, 256 : 256 + 256]),
self.transformer.wte(idx[:, 256 + 256 :]),
],
dim=1,
)
else:
tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
if past_kv is None:
past_length = 0
past_kv = tuple([None] * len(self.transformer.h))
else:
past_length = past_kv[0][0].size(-2)
if position_ids is None:
position_ids = torch.arange(past_length, t + past_length, dtype=torch.long, device=device)
position_ids = position_ids.unsqueeze(0) # shape (1, t)
assert position_ids.shape == (1, t)
pos_emb = self.transformer.wpe(position_ids) # position embeddings of shape (1, t, n_embd)
x = self.transformer.drop(tok_emb + pos_emb)
new_kv = () if use_cache else None
for _, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
x, kv = block(x, past_kv=past_layer_kv, use_cache=use_cache)
if use_cache:
new_kv = new_kv + (kv,)
x = self.transformer.ln_f(x)
# inference-time mini-optimization: only forward the lm_head on the very last position
logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
return (logits, new_kv)
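# Hypothetical shape check for GPT.forward() above with merge_context=True,
# mirroring how the semantic model is driven (256 text tokens + 256 history
# tokens + 1 infer token in, logits for the last position out). The config
# values here are deliberately tiny:
import torch

cfg = GPTConfig(n_layer=2, n_head=2, n_embd=8, block_size=1024,
                input_vocab_size=128, output_vocab_size=128, dropout=0.0)
gpt = GPT(cfg).eval()
idx = torch.randint(0, 128, (1, 256 + 256 + 1))
logits, kv = gpt(idx, merge_context=True, use_cache=True)
print(logits.shape)    # torch.Size([1, 1, 128]) - only the last position is returned
print(kv[0][0].shape)  # per-layer key cache: torch.Size([1, 2, 257, 4])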

View File

@ -0,0 +1,142 @@
"""
Much of this code is adapted from Andrej Karpathy's NanoGPT
(https://github.com/karpathy/nanoGPT)
"""
import math
from dataclasses import dataclass
import torch
from torch import nn
from torch.nn import functional as F
from .model import GPT, MLP, GPTConfig
class NonCausalSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
assert config.n_embd % config.n_head == 0
# key, query, value projections for all heads, but in a batch
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
# output projection
self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
# regularization
self.attn_dropout = nn.Dropout(config.dropout)
self.resid_dropout = nn.Dropout(config.dropout)
self.n_head = config.n_head
self.n_embd = config.n_embd
self.dropout = config.dropout
# flash attention makes GPU go brrrrr but support is only in PyTorch nightly and still a bit scary
self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention") and self.dropout == 0.0
def forward(self, x):
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
# causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
if self.flash:
# efficient attention using Flash Attention CUDA kernels
y = torch.nn.functional.scaled_dot_product_attention(
q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=False
)
else:
# manual implementation of attention
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
att = F.softmax(att, dim=-1)
att = self.attn_dropout(att)
y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
# output projection
y = self.resid_dropout(self.c_proj(y))
return y
class FineBlock(nn.Module):
def __init__(self, config):
super().__init__()
self.ln_1 = nn.LayerNorm(config.n_embd)
self.attn = NonCausalSelfAttention(config)
self.ln_2 = nn.LayerNorm(config.n_embd)
self.mlp = MLP(config)
def forward(self, x):
x = x + self.attn(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x
class FineGPT(GPT):
def __init__(self, config):
super().__init__(config)
del self.lm_head
self.config = config
self.n_codes_total = config.n_codes_total
self.transformer = nn.ModuleDict(
dict(
wtes=nn.ModuleList(
[nn.Embedding(config.input_vocab_size, config.n_embd) for _ in range(config.n_codes_total)]
),
wpe=nn.Embedding(config.block_size, config.n_embd),
drop=nn.Dropout(config.dropout),
h=nn.ModuleList([FineBlock(config) for _ in range(config.n_layer)]),
ln_f=nn.LayerNorm(config.n_embd),
)
)
self.lm_heads = nn.ModuleList(
[
nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
for _ in range(config.n_codes_given, self.n_codes_total)
]
)
for i in range(self.n_codes_total - config.n_codes_given):
self.transformer.wtes[i + 1].weight = self.lm_heads[i].weight
def forward(self, pred_idx, idx):
device = idx.device
b, t, codes = idx.size()
assert (
t <= self.config.block_size
), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
assert pred_idx > 0, "cannot predict 0th codebook"
assert codes == self.n_codes_total, (b, t, codes)
pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)
# forward the GPT model itself
tok_embs = [
wte(idx[:, :, i]).unsqueeze(-1) for i, wte in enumerate(self.transformer.wtes)
] # token embeddings of shape (b, t, n_embd)
tok_emb = torch.cat(tok_embs, dim=-1)
pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
x = tok_emb[:, :, :, : pred_idx + 1].sum(dim=-1)
x = self.transformer.drop(x + pos_emb)
for block in self.transformer.h:
x = block(x)
x = self.transformer.ln_f(x)
logits = self.lm_heads[pred_idx - self.config.n_codes_given](x)
return logits
def get_num_params(self, non_embedding=True):
"""
Return the number of parameters in the model.
For non-embedding count (default), the position embeddings get subtracted.
The token embeddings would too, except due to the parameter sharing these
params are actually used as weights in the final layer, so we include them.
"""
n_params = sum(p.numel() for p in self.parameters())
if non_embedding:
for wte in self.transformer.wtes:
n_params -= wte.weight.numel()
n_params -= self.transformer.wpe.weight.numel()
return n_params
@dataclass
class FineGPTConfig(GPTConfig):
n_codes_total: int = 8
n_codes_given: int = 1
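# Hypothetical shape check for FineGPT.forward() above: predict codebook
# `pred_idx` from all n_codes_total codebooks of a 1024-frame window (tiny,
# made-up config values):
import torch

cfg = FineGPTConfig(n_layer=1, n_head=2, n_embd=8, block_size=1024,
                    input_vocab_size=1056, output_vocab_size=1056,
                    dropout=0.0, n_codes_total=8, n_codes_given=1)
fine = FineGPT(cfg).eval()
idx = torch.randint(0, 1056, (1, 1024, 8))  # (batch, time, codebooks)
logits = fine(pred_idx=2, idx=idx)
print(logits.shape)                         # torch.Size([1, 1024, 1056])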

View File

@ -0,0 +1,563 @@
### credit: https://github.com/dunky11/voicesmith
from typing import Callable, Dict, Tuple
import torch
import torch.nn.functional as F
from coqpit import Coqpit
from torch import nn
from TTS.tts.layers.delightful_tts.conformer import Conformer
from TTS.tts.layers.delightful_tts.encoders import (
PhonemeLevelProsodyEncoder,
UtteranceLevelProsodyEncoder,
get_mask_from_lengths,
)
from TTS.tts.layers.delightful_tts.energy_adaptor import EnergyAdaptor
from TTS.tts.layers.delightful_tts.networks import EmbeddingPadded, positional_encoding
from TTS.tts.layers.delightful_tts.phoneme_prosody_predictor import PhonemeProsodyPredictor
from TTS.tts.layers.delightful_tts.pitch_adaptor import PitchAdaptor
from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
from TTS.tts.layers.generic.aligner import AlignmentNetwork
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
class AcousticModel(torch.nn.Module):
def __init__(
self,
args: "ModelArgs",
tokenizer: "TTSTokenizer" = None,
speaker_manager: "SpeakerManager" = None,
):
super().__init__()
self.args = args
self.tokenizer = tokenizer
self.speaker_manager = speaker_manager
self.init_multispeaker(args)
# self.set_embedding_dims()
self.length_scale = (
float(self.args.length_scale) if isinstance(self.args.length_scale, int) else self.args.length_scale
)
self.emb_dim = args.n_hidden_conformer_encoder
self.encoder = Conformer(
dim=self.args.n_hidden_conformer_encoder,
n_layers=self.args.n_layers_conformer_encoder,
n_heads=self.args.n_heads_conformer_encoder,
speaker_embedding_dim=self.embedded_speaker_dim,
p_dropout=self.args.dropout_conformer_encoder,
kernel_size_conv_mod=self.args.kernel_size_conv_mod_conformer_encoder,
lrelu_slope=self.args.lrelu_slope,
)
self.pitch_adaptor = PitchAdaptor(
n_input=self.args.n_hidden_conformer_encoder,
n_hidden=self.args.n_hidden_variance_adaptor,
n_out=1,
kernel_size=self.args.kernel_size_variance_adaptor,
emb_kernel_size=self.args.emb_kernel_size_variance_adaptor,
p_dropout=self.args.dropout_variance_adaptor,
lrelu_slope=self.args.lrelu_slope,
)
self.energy_adaptor = EnergyAdaptor(
channels_in=self.args.n_hidden_conformer_encoder,
channels_hidden=self.args.n_hidden_variance_adaptor,
channels_out=1,
kernel_size=self.args.kernel_size_variance_adaptor,
emb_kernel_size=self.args.emb_kernel_size_variance_adaptor,
dropout=self.args.dropout_variance_adaptor,
lrelu_slope=self.args.lrelu_slope,
)
self.aligner = AlignmentNetwork(
in_query_channels=self.args.out_channels,
in_key_channels=self.args.n_hidden_conformer_encoder,
)
self.duration_predictor = VariancePredictor(
channels_in=self.args.n_hidden_conformer_encoder,
channels=self.args.n_hidden_variance_adaptor,
channels_out=1,
kernel_size=self.args.kernel_size_variance_adaptor,
p_dropout=self.args.dropout_variance_adaptor,
lrelu_slope=self.args.lrelu_slope,
)
self.utterance_prosody_encoder = UtteranceLevelProsodyEncoder(
num_mels=self.args.num_mels,
ref_enc_filters=self.args.ref_enc_filters_reference_encoder,
ref_enc_size=self.args.ref_enc_size_reference_encoder,
ref_enc_gru_size=self.args.ref_enc_gru_size_reference_encoder,
ref_enc_strides=self.args.ref_enc_strides_reference_encoder,
n_hidden=self.args.n_hidden_conformer_encoder,
dropout=self.args.dropout_conformer_encoder,
bottleneck_size_u=self.args.bottleneck_size_u_reference_encoder,
token_num=self.args.token_num_reference_encoder,
)
self.utterance_prosody_predictor = PhonemeProsodyPredictor(
hidden_size=self.args.n_hidden_conformer_encoder,
kernel_size=self.args.predictor_kernel_size_reference_encoder,
dropout=self.args.dropout_conformer_encoder,
bottleneck_size=self.args.bottleneck_size_u_reference_encoder,
lrelu_slope=self.args.lrelu_slope,
)
self.phoneme_prosody_encoder = PhonemeLevelProsodyEncoder(
num_mels=self.args.num_mels,
ref_enc_filters=self.args.ref_enc_filters_reference_encoder,
ref_enc_size=self.args.ref_enc_size_reference_encoder,
ref_enc_gru_size=self.args.ref_enc_gru_size_reference_encoder,
ref_enc_strides=self.args.ref_enc_strides_reference_encoder,
n_hidden=self.args.n_hidden_conformer_encoder,
dropout=self.args.dropout_conformer_encoder,
bottleneck_size_p=self.args.bottleneck_size_p_reference_encoder,
n_heads=self.args.n_heads_conformer_encoder,
)
self.phoneme_prosody_predictor = PhonemeProsodyPredictor(
hidden_size=self.args.n_hidden_conformer_encoder,
kernel_size=self.args.predictor_kernel_size_reference_encoder,
dropout=self.args.dropout_conformer_encoder,
bottleneck_size=self.args.bottleneck_size_p_reference_encoder,
lrelu_slope=self.args.lrelu_slope,
)
self.u_bottle_out = nn.Linear(
self.args.bottleneck_size_u_reference_encoder,
self.args.n_hidden_conformer_encoder,
)
self.u_norm = nn.InstanceNorm1d(self.args.bottleneck_size_u_reference_encoder)
self.p_bottle_out = nn.Linear(
self.args.bottleneck_size_p_reference_encoder,
self.args.n_hidden_conformer_encoder,
)
self.p_norm = nn.InstanceNorm1d(
self.args.bottleneck_size_p_reference_encoder,
)
self.decoder = Conformer(
dim=self.args.n_hidden_conformer_decoder,
n_layers=self.args.n_layers_conformer_decoder,
n_heads=self.args.n_heads_conformer_decoder,
speaker_embedding_dim=self.embedded_speaker_dim,
p_dropout=self.args.dropout_conformer_decoder,
kernel_size_conv_mod=self.args.kernel_size_conv_mod_conformer_decoder,
lrelu_slope=self.args.lrelu_slope,
)
padding_idx = self.tokenizer.characters.pad_id
self.src_word_emb = EmbeddingPadded(
self.args.num_chars, self.args.n_hidden_conformer_encoder, padding_idx=padding_idx
)
self.to_mel = nn.Linear(
self.args.n_hidden_conformer_decoder,
self.args.num_mels,
)
self.energy_scaler = torch.nn.BatchNorm1d(1, affine=False, track_running_stats=True, momentum=None)
self.energy_scaler.requires_grad_(False)
def init_multispeaker(self, args: Coqpit): # pylint: disable=unused-argument
"""Init for multi-speaker training."""
self.embedded_speaker_dim = 0
self.num_speakers = self.args.num_speakers
self.audio_transform = None
if self.speaker_manager:
self.num_speakers = self.speaker_manager.num_speakers
if self.args.use_speaker_embedding:
self._init_speaker_embedding()
if self.args.use_d_vector_file:
self._init_d_vector()
@staticmethod
def _set_cond_input(aux_input: Dict):
"""Set the speaker conditioning input based on the multi-speaker mode."""
sid, g, lid, durations = None, None, None, None
if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None:
sid = aux_input["speaker_ids"]
if sid.ndim == 0:
sid = sid.unsqueeze_(0)
if "d_vectors" in aux_input and aux_input["d_vectors"] is not None:
g = F.normalize(aux_input["d_vectors"]) # .unsqueeze_(-1)
if g.ndim == 2:
g = g # .unsqueeze_(0) # pylint: disable=self-assigning-variable
if "durations" in aux_input and aux_input["durations"] is not None:
durations = aux_input["durations"]
return sid, g, lid, durations
def get_aux_input(self, aux_input: Dict):
sid, g, lid, _ = self._set_cond_input(aux_input)
return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid}
def _set_speaker_input(self, aux_input: Dict):
d_vectors = aux_input.get("d_vectors", None)
speaker_ids = aux_input.get("speaker_ids", None)
if d_vectors is not None and speaker_ids is not None:
raise ValueError("[!] Cannot use d-vectors and speaker-ids together.")
if speaker_ids is not None and not hasattr(self, "emb_g"):
raise ValueError("[!] Cannot use speaker-ids without enabling speaker embedding.")
g = speaker_ids if speaker_ids is not None else d_vectors
return g
# def set_embedding_dims(self):
# if self.embedded_speaker_dim > 0:
# self.embedding_dims = self.embedded_speaker_dim
# else:
# self.embedding_dims = 0
def _init_speaker_embedding(self):
# pylint: disable=attribute-defined-outside-init
if self.num_speakers > 0:
print(" > initialization of speaker-embedding layers.")
self.embedded_speaker_dim = self.args.speaker_embedding_channels
self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
def _init_d_vector(self):
# pylint: disable=attribute-defined-outside-init
if hasattr(self, "emb_g"):
raise ValueError("[!] Speaker embedding layer already initialized before d_vector settings.")
self.embedded_speaker_dim = self.args.d_vector_dim
@staticmethod
def generate_attn(dr, x_mask, y_mask=None):
"""Generate an attention mask from the linear scale durations.
Args:
dr (Tensor): Linear scale durations.
x_mask (Tensor): Mask for the input (character) sequence.
y_mask (Tensor): Mask for the output (spectrogram) sequence. Compute it from the predicted durations
if None. Defaults to None.
Shapes
- dr: :math:`(B, T_{en})`
- x_mask: :math:`(B, T_{en})`
- y_mask: :math:`(B, T_{de})`
"""
# compute decode mask from the durations
if y_mask is None:
y_lengths = dr.sum(1).long()
y_lengths[y_lengths < 1] = 1
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype)
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype)
return attn
def _expand_encoder_with_durations(
self,
o_en: torch.FloatTensor,
dr: torch.IntTensor,
x_mask: torch.IntTensor,
y_lengths: torch.IntTensor,
):
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype)
attn = self.generate_attn(dr, x_mask, y_mask)
o_en_ex = torch.einsum("kmn, kjm -> kjn", [attn.float(), o_en])
return y_mask, o_en_ex, attn.transpose(1, 2)
def _forward_aligner(
self,
x: torch.FloatTensor,
y: torch.FloatTensor,
x_mask: torch.IntTensor,
y_mask: torch.IntTensor,
attn_priors: torch.FloatTensor,
) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
"""Aligner forward pass.
1. Compute a mask to apply to the attention map.
2. Run the alignment network.
3. Apply MAS to compute the hard alignment map.
4. Compute the durations from the hard alignment map.
Args:
x (torch.FloatTensor): Input sequence.
y (torch.FloatTensor): Output sequence.
x_mask (torch.IntTensor): Input sequence mask.
y_mask (torch.IntTensor): Output sequence mask.
attn_priors (torch.FloatTensor): Prior for the aligner network map.
Returns:
Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
Durations from the hard alignment map, soft alignment potentials, log scale alignment potentials,
hard alignment map.
Shapes:
- x: :math:`[B, T_en, C_en]`
- y: :math:`[B, T_de, C_de]`
- x_mask: :math:`[B, 1, T_en]`
- y_mask: :math:`[B, 1, T_de]`
- attn_priors: :math:`[B, T_de, T_en]`
- aligner_durations: :math:`[B, T_en]`
- aligner_soft: :math:`[B, T_de, T_en]`
- aligner_logprob: :math:`[B, 1, T_de, T_en]`
- aligner_mas: :math:`[B, T_de, T_en]`
"""
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) # [B, 1, T_en, T_de]
aligner_soft, aligner_logprob = self.aligner(y.transpose(1, 2), x.transpose(1, 2), x_mask, attn_priors)
aligner_mas = maximum_path(
aligner_soft.squeeze(1).transpose(1, 2).contiguous(), attn_mask.squeeze(1).contiguous()
)
aligner_durations = torch.sum(aligner_mas, -1).int()
aligner_soft = aligner_soft.squeeze(1) # [B, T_max2, T_max]
aligner_mas = aligner_mas.transpose(1, 2) # [B, T_max, T_max2] -> [B, T_max2, T_max]
return aligner_durations, aligner_soft, aligner_logprob, aligner_mas
def average_utterance_prosody( # pylint: disable=no-self-use
self, u_prosody_pred: torch.Tensor, src_mask: torch.Tensor
) -> torch.Tensor:
lengths = ((~src_mask) * 1.0).sum(1)
u_prosody_pred = u_prosody_pred.sum(1, keepdim=True) / lengths.view(-1, 1, 1)
return u_prosody_pred
def forward(
self,
tokens: torch.Tensor,
src_lens: torch.Tensor,
mels: torch.Tensor,
mel_lens: torch.Tensor,
pitches: torch.Tensor,
energies: torch.Tensor,
attn_priors: torch.Tensor,
use_ground_truth: bool = True,
d_vectors: torch.Tensor = None,
speaker_idx: torch.Tensor = None,
) -> Dict[str, torch.Tensor]:
sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable
{"d_vectors": d_vectors, "speaker_ids": speaker_idx}
) # pylint: disable=unused-variable
src_mask = get_mask_from_lengths(src_lens) # [B, T_src]
mel_mask = get_mask_from_lengths(mel_lens) # [B, T_mel]
# Token embeddings
token_embeddings = self.src_word_emb(tokens) # [B, T_src, C_hidden]
token_embeddings = token_embeddings.masked_fill(src_mask.unsqueeze(-1), 0.0)
# Alignment network and durations
aligner_durations, aligner_soft, aligner_logprob, aligner_mas = self._forward_aligner(
x=token_embeddings,
y=mels.transpose(1, 2),
x_mask=~src_mask[:, None],
y_mask=~mel_mask[:, None],
attn_priors=attn_priors,
)
dr = aligner_durations # [B, T_en]
# Embeddings
speaker_embedding = None
if d_vectors is not None:
speaker_embedding = g
elif speaker_idx is not None:
speaker_embedding = F.normalize(self.emb_g(sid))
pos_encoding = positional_encoding(
self.emb_dim,
max(token_embeddings.shape[1], max(mel_lens)),
device=token_embeddings.device,
)
encoder_outputs = self.encoder(
token_embeddings,
src_mask,
speaker_embedding=speaker_embedding,
encoding=pos_encoding,
)
u_prosody_ref = self.u_norm(self.utterance_prosody_encoder(mels=mels, mel_lens=mel_lens))
u_prosody_pred = self.u_norm(
self.average_utterance_prosody(
u_prosody_pred=self.utterance_prosody_predictor(x=encoder_outputs, mask=src_mask),
src_mask=src_mask,
)
)
if use_ground_truth:
encoder_outputs = encoder_outputs + self.u_bottle_out(u_prosody_ref)
else:
encoder_outputs = encoder_outputs + self.u_bottle_out(u_prosody_pred)
p_prosody_ref = self.p_norm(
self.phoneme_prosody_encoder(
x=encoder_outputs, src_mask=src_mask, mels=mels, mel_lens=mel_lens, encoding=pos_encoding
)
)
p_prosody_pred = self.p_norm(self.phoneme_prosody_predictor(x=encoder_outputs, mask=src_mask))
if use_ground_truth:
encoder_outputs = encoder_outputs + self.p_bottle_out(p_prosody_ref)
else:
encoder_outputs = encoder_outputs + self.p_bottle_out(p_prosody_pred)
encoder_outputs_res = encoder_outputs
pitch_pred, avg_pitch_target, pitch_emb = self.pitch_adaptor.get_pitch_embedding_train(
x=encoder_outputs,
target=pitches,
dr=dr,
mask=src_mask,
)
energy_pred, avg_energy_target, energy_emb = self.energy_adaptor.get_energy_embedding_train(
x=encoder_outputs,
target=energies,
dr=dr,
mask=src_mask,
)
encoder_outputs = encoder_outputs.transpose(1, 2) + pitch_emb + energy_emb
log_duration_prediction = self.duration_predictor(x=encoder_outputs_res.detach(), mask=src_mask)
mel_pred_mask, encoder_outputs_ex, alignments = self._expand_encoder_with_durations(
o_en=encoder_outputs, y_lengths=mel_lens, dr=dr, x_mask=~src_mask[:, None]
)
x = self.decoder(
encoder_outputs_ex.transpose(1, 2),
mel_mask,
speaker_embedding=speaker_embedding,
encoding=pos_encoding,
)
x = self.to_mel(x)
dr = torch.log(dr + 1)
dr_pred = torch.exp(log_duration_prediction) - 1
alignments_dp = self.generate_attn(dr_pred, src_mask.unsqueeze(1), mel_pred_mask) # [B, T_max, T_max2']
return {
"model_outputs": x,
"pitch_pred": pitch_pred,
"pitch_target": avg_pitch_target,
"energy_pred": energy_pred,
"energy_target": avg_energy_target,
"u_prosody_pred": u_prosody_pred,
"u_prosody_ref": u_prosody_ref,
"p_prosody_pred": p_prosody_pred,
"p_prosody_ref": p_prosody_ref,
"alignments_dp": alignments_dp,
"alignments": alignments, # [B, T_de, T_en]
"aligner_soft": aligner_soft,
"aligner_mas": aligner_mas,
"aligner_durations": aligner_durations,
"aligner_logprob": aligner_logprob,
"dr_log_pred": log_duration_prediction.squeeze(1), # [B, T]
"dr_log_target": dr.squeeze(1), # [B, T]
"spk_emb": speaker_embedding,
}
@torch.no_grad()
def inference(
self,
tokens: torch.Tensor,
speaker_idx: torch.Tensor,
p_control: float = None, # TODO # pylint: disable=unused-argument
d_control: float = None, # TODO # pylint: disable=unused-argument
d_vectors: torch.Tensor = None,
pitch_transform: Callable = None,
energy_transform: Callable = None,
) -> torch.Tensor:
src_mask = get_mask_from_lengths(torch.tensor([tokens.shape[1]], dtype=torch.int64, device=tokens.device))
src_lens = torch.tensor(tokens.shape[1:2]).to(tokens.device) # pylint: disable=unused-variable
sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable
{"d_vectors": d_vectors, "speaker_ids": speaker_idx}
) # pylint: disable=unused-variable
token_embeddings = self.src_word_emb(tokens)
token_embeddings = token_embeddings.masked_fill(src_mask.unsqueeze(-1), 0.0)
# Embeddings
speaker_embedding = None
if d_vectors is not None:
speaker_embedding = g
elif speaker_idx is not None:
speaker_embedding = F.normalize(self.emb_g(sid))
pos_encoding = positional_encoding(
self.emb_dim,
token_embeddings.shape[1],
device=token_embeddings.device,
)
encoder_outputs = self.encoder(
token_embeddings,
src_mask,
speaker_embedding=speaker_embedding,
encoding=pos_encoding,
)
u_prosody_pred = self.u_norm(
self.average_utterance_prosody(
u_prosody_pred=self.utterance_prosody_predictor(x=encoder_outputs, mask=src_mask),
src_mask=src_mask,
)
)
encoder_outputs = encoder_outputs + self.u_bottle_out(u_prosody_pred).expand_as(encoder_outputs)
p_prosody_pred = self.p_norm(
self.phoneme_prosody_predictor(
x=encoder_outputs,
mask=src_mask,
)
)
encoder_outputs = encoder_outputs + self.p_bottle_out(p_prosody_pred).expand_as(encoder_outputs)
encoder_outputs_res = encoder_outputs
pitch_emb_pred, pitch_pred = self.pitch_adaptor.get_pitch_embedding(
x=encoder_outputs,
mask=src_mask,
pitch_transform=pitch_transform,
pitch_mean=self.pitch_mean if hasattr(self, "pitch_mean") else None,
pitch_std=self.pitch_std if hasattr(self, "pitch_std") else None,
)
energy_emb_pred, energy_pred = self.energy_adaptor.get_energy_embedding(
x=encoder_outputs, mask=src_mask, energy_transform=energy_transform
)
encoder_outputs = encoder_outputs.transpose(1, 2) + pitch_emb_pred + energy_emb_pred
log_duration_pred = self.duration_predictor(
x=encoder_outputs_res.detach(), mask=src_mask
) # [B, C_hidden, T_src] -> [B, T_src]
duration_pred = (torch.exp(log_duration_pred) - 1) * (~src_mask) * self.length_scale # -> [B, T_src]
duration_pred[duration_pred < 1] = 1.0 # -> [B, T_src]
duration_pred = torch.round(duration_pred) # -> [B, T_src]
mel_lens = duration_pred.sum(1) # -> [B,]
_, encoder_outputs_ex, alignments = self._expand_encoder_with_durations(
o_en=encoder_outputs, y_lengths=mel_lens, dr=duration_pred.squeeze(1), x_mask=~src_mask[:, None]
)
mel_mask = get_mask_from_lengths(
torch.tensor([encoder_outputs_ex.shape[2]], dtype=torch.int64, device=encoder_outputs_ex.device)
)
# default to the precomputed positional encoding; recompute only if the expanded sequence is longer
encoding = pos_encoding
if encoder_outputs_ex.shape[2] > pos_encoding.shape[1]:
encoding = positional_encoding(self.emb_dim, encoder_outputs_ex.shape[2], device=tokens.device)
# [B, C_hidden, T_src], [B, 1, T_src], [B, C_emb], [B, T_src, C_hidden] -> [B, C_hidden, T_src]
x = self.decoder(
encoder_outputs_ex.transpose(1, 2),
mel_mask,
speaker_embedding=speaker_embedding,
encoding=encoding,
)
x = self.to_mel(x)
outputs = {
"model_outputs": x,
"alignments": alignments,
# "pitch": pitch_emb_pred,
"durations": duration_pred,
"pitch": pitch_pred,
"energy": energy_pred,
"spk_emb": speaker_embedding,
}
return outputs
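# Illustrative call pattern for `inference()` above (a hedged sketch, not part of the library code):
# it assumes `model` is an initialized instance of this acoustic model with a fitted tokenizer, that
# token IDs are produced upstream as a [B, T_src] LongTensor, and that either `speaker_idx` or
# `d_vectors` is supplied when the model is multi-speaker.
#
#     token_ids = torch.LongTensor(tokenizer.text_to_ids("hello world")).unsqueeze(0)  # [1, T_src]
#     outputs = model.inference(tokens=token_ids, speaker_idx=torch.LongTensor([0]), d_vectors=None)
#     mel = outputs["model_outputs"]      # [1, T_mel, num_mels] predicted mel-spectrogram
#     durations = outputs["durations"]    # [1, T_src] per-token frame counts (after length_scale)
#     alignments = outputs["alignments"]  # [1, T_mel, T_src] hard alignment built from the durations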

View File

@@ -0,0 +1,450 @@
### credit: https://github.com/dunky11/voicesmith
import math
from typing import Tuple
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
import torch.nn.functional as F
from TTS.tts.layers.delightful_tts.conv_layers import Conv1dGLU, DepthWiseConv1d, PointwiseConv1d
from TTS.tts.layers.delightful_tts.networks import GLUActivation
def calc_same_padding(kernel_size: int) -> Tuple[int, int]:
pad = kernel_size // 2
return (pad, pad - (kernel_size + 1) % 2)
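# Quick sanity check of the padding rule above (illustrative only): for an odd kernel the two
# pads are equal, for an even kernel the right pad is one smaller, so a stride-1 convolution
# keeps the sequence length unchanged.
#     calc_same_padding(7)  # -> (3, 3)
#     calc_same_padding(4)  # -> (2, 1)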
class Conformer(nn.Module):
def __init__(
self,
dim: int,
n_layers: int,
n_heads: int,
speaker_embedding_dim: int,
p_dropout: float,
kernel_size_conv_mod: int,
lrelu_slope: float,
):
"""
A Transformer variant that integrates both CNN and Transformer components.
Conformer proposes a novel combination of self-attention and convolution, in which self-attention
learns the global interaction while the convolutions efficiently capture the local correlations.
Args:
dim (int): Number of the dimensions for the model.
n_layers (int): Number of model layers.
n_heads (int): The number of attention heads.
speaker_embedding_dim (int): Number of speaker embedding dimensions.
p_dropout (float): Probability of dropout.
kernel_size_conv_mod (int): Size of kernels for convolution modules.
Inputs: inputs, mask
- **inputs** (batch, time, dim): Tensor containing input vector
- **encoding** (batch, time, dim): Positional embedding tensor
- **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
Returns:
- **outputs** (batch, time, dim): Tensor produced by Conformer Encoder.
"""
super().__init__()
d_k = d_v = dim // n_heads
self.layer_stack = nn.ModuleList(
[
ConformerBlock(
dim,
n_heads,
d_k,
d_v,
kernel_size_conv_mod=kernel_size_conv_mod,
dropout=p_dropout,
speaker_embedding_dim=speaker_embedding_dim,
lrelu_slope=lrelu_slope,
)
for _ in range(n_layers)
]
)
def forward(
self,
x: torch.Tensor,
mask: torch.Tensor,
speaker_embedding: torch.Tensor,
encoding: torch.Tensor,
) -> torch.Tensor:
"""
Shapes:
- x: :math:`[B, T_src, C]`
- mask: :math: `[B, T_src]`
- speaker_embedding: :math: `[B, C]`
- encoding: :math: `[B, T_max2, C]`
"""
attn_mask = mask.view((mask.shape[0], 1, 1, mask.shape[1]))
for enc_layer in self.layer_stack:
x = enc_layer(
x,
mask=mask,
slf_attn_mask=attn_mask,
speaker_embedding=speaker_embedding,
encoding=encoding,
)
return x
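# Minimal usage sketch for the encoder above (illustrative, with assumed hyperparameters):
#
#     conformer = Conformer(dim=192, n_layers=2, n_heads=2, speaker_embedding_dim=64,
#                           p_dropout=0.1, kernel_size_conv_mod=7, lrelu_slope=0.3)
#     x = torch.randn(2, 50, 192)                  # [B, T_src, C] input features
#     mask = torch.zeros(2, 50, dtype=torch.bool)  # [B, T_src]; True marks padded positions
#     spk = torch.randn(2, 64)                     # [B, C_spk] speaker embedding
#     enc = positional_encoding(192, 50, device=x.device)  # [1, T_src, C], from networks.py
#     y = conformer(x, mask, speaker_embedding=spk, encoding=enc)  # -> [2, 50, 192]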
class ConformerBlock(torch.nn.Module):
def __init__(
self,
d_model: int,
n_head: int,
d_k: int, # pylint: disable=unused-argument
d_v: int, # pylint: disable=unused-argument
kernel_size_conv_mod: int,
speaker_embedding_dim: int,
dropout: float,
lrelu_slope: float = 0.3,
):
"""
A Conformer block is composed of four modules stacked together: a feed-forward module,
a self-attention module, a convolution module, and a second feed-forward module at the end.
The two feed-forward modules sandwich the multi-headed self-attention module and the
convolution module.
Args:
d_model (int): The dimension of model
n_head (int): The number of attention heads.
kernel_size_conv_mod (int): Size of kernels for convolution modules.
speaker_embedding_dim (int): Number of speaker embedding dimensions.
dropout (float): Probability of dropout.
Inputs: inputs, mask
- **inputs** (batch, time, dim): Tensor containing input vector
- **encoding** (batch, time, dim): Positional embedding tensor
- **slf_attn_mask** (batch, 1, 1, time1): Tensor containing indices to be masked in self attention module
- **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
Returns:
- **outputs** (batch, time, dim): Tensor produced by the Conformer Block.
"""
super().__init__()
if isinstance(speaker_embedding_dim, int):
self.conditioning = Conv1dGLU(
d_model=d_model,
kernel_size=kernel_size_conv_mod,
padding=kernel_size_conv_mod // 2,
embedding_dim=speaker_embedding_dim,
)
self.ff = FeedForward(d_model=d_model, dropout=dropout, kernel_size=3, lrelu_slope=lrelu_slope)
self.conformer_conv_1 = ConformerConvModule(
d_model, kernel_size=kernel_size_conv_mod, dropout=dropout, lrelu_slope=lrelu_slope
)
self.ln = nn.LayerNorm(d_model)
self.slf_attn = ConformerMultiHeadedSelfAttention(d_model=d_model, num_heads=n_head, dropout_p=dropout)
self.conformer_conv_2 = ConformerConvModule(
d_model, kernel_size=kernel_size_conv_mod, dropout=dropout, lrelu_slope=lrelu_slope
)
def forward(
self,
x: torch.Tensor,
speaker_embedding: torch.Tensor,
mask: torch.Tensor,
slf_attn_mask: torch.Tensor,
encoding: torch.Tensor,
) -> torch.Tensor:
"""
Shapes:
- x: :math:`[B, T_src, C]`
- mask: :math: `[B, T_src]`
- slf_attn_mask: :math: `[B, 1, 1, T_src]`
- speaker_embedding: :math: `[B, C]`
- encoding: :math: `[B, T_max2, C]`
"""
if speaker_embedding is not None:
x = self.conditioning(x, embeddings=speaker_embedding)
x = self.ff(x) + x
x = self.conformer_conv_1(x) + x
res = x
x = self.ln(x)
x, _ = self.slf_attn(query=x, key=x, value=x, mask=slf_attn_mask, encoding=encoding)
x = x + res
x = x.masked_fill(mask.unsqueeze(-1), 0)
x = self.conformer_conv_2(x) + x
return x
class FeedForward(nn.Module):
def __init__(
self,
d_model: int,
kernel_size: int,
dropout: float,
lrelu_slope: float,
expansion_factor: int = 4,
):
"""
Feed Forward module for conformer block.
Args:
d_model (int): The dimension of model.
kernel_size (int): Size of the kernels for conv layers.
dropout (float): probability of dropout.
expansion_factor (int): The factor by which to project the number of channels.
lrelu_slope (float): The negative slope factor for the leaky ReLU activation.
Inputs: inputs
- **inputs** (batch, time, dim): Tensor containing input vector
Returns:
- **outputs** (batch, time, dim): Tensor produced by the feed forward module.
"""
super().__init__()
self.dropout = nn.Dropout(dropout)
self.ln = nn.LayerNorm(d_model)
self.conv_1 = nn.Conv1d(
d_model,
d_model * expansion_factor,
kernel_size=kernel_size,
padding=kernel_size // 2,
)
self.act = nn.LeakyReLU(lrelu_slope)
self.conv_2 = nn.Conv1d(d_model * expansion_factor, d_model, kernel_size=1)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Shapes:
x: :math: `[B, T, C]`
"""
x = self.ln(x)
x = x.permute((0, 2, 1))
x = self.conv_1(x)
x = x.permute((0, 2, 1))
x = self.act(x)
x = self.dropout(x)
x = x.permute((0, 2, 1))
x = self.conv_2(x)
x = x.permute((0, 2, 1))
x = self.dropout(x)
# half-step residual scaling of the feed-forward branch, as in the Macaron-style FF of Conformer
x = 0.5 * x
return x
class ConformerConvModule(nn.Module):
def __init__(
self,
d_model: int,
expansion_factor: int = 2,
kernel_size: int = 7,
dropout: float = 0.1,
lrelu_slope: float = 0.3,
):
"""
Convolution module for the Conformer block. It starts with a gating mechanism:
a pointwise convolution followed by a gated linear unit (GLU). This is followed
by a single 1-D depthwise convolution layer. Group normalization is deployed just after the depthwise
convolution to help with training. It also contains an expansion factor to project the number of channels.
Args:
d_model (int): The dimension of model.
expansion_factor (int): The factor by which to project the number of channels.
kernel_size (int): Size of kernels for convolution modules.
dropout (float): Probability of dropout.
lrelu_slope (float): The slope coefficient for leaky relu activation.
Inputs: inputs
- **inputs** (batch, time, dim): Tensor containing input vector
Returns:
- **outputs** (batch, time, dim): Tensor produced by the conv module.
"""
super().__init__()
inner_dim = d_model * expansion_factor
self.ln_1 = nn.LayerNorm(d_model)
self.conv_1 = PointwiseConv1d(d_model, inner_dim * 2)
self.conv_act = GLUActivation(slope=lrelu_slope)
self.depthwise = DepthWiseConv1d(
inner_dim,
inner_dim,
kernel_size=kernel_size,
padding=calc_same_padding(kernel_size)[0],
)
self.ln_2 = nn.GroupNorm(1, inner_dim)
self.activation = nn.LeakyReLU(lrelu_slope)
self.conv_2 = PointwiseConv1d(inner_dim, d_model)
self.dropout = nn.Dropout(dropout)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Shapes:
x: :math: `[B, T, C]`
"""
x = self.ln_1(x)
x = x.permute(0, 2, 1)
x = self.conv_1(x)
x = self.conv_act(x)
x = self.depthwise(x)
x = self.ln_2(x)
x = self.activation(x)
x = self.conv_2(x)
x = x.permute(0, 2, 1)
x = self.dropout(x)
return x
class ConformerMultiHeadedSelfAttention(nn.Module):
"""
Conformer employs multi-headed self-attention (MHSA) while integrating an important technique from Transformer-XL,
the relative sinusoidal positional encoding scheme. The relative positional encoding allows the self-attention
module to generalize better to different input lengths, and the resulting encoder is more robust to the variance of
the utterance length. Conformer uses pre-norm residual units with dropout, which helps with training
and regularizing deeper models.
Args:
d_model (int): The dimension of model
num_heads (int): The number of attention heads.
dropout_p (float): probability of dropout
Inputs: inputs, mask
- **inputs** (batch, time, dim): Tensor containing input vector
- **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
Returns:
- **outputs** (batch, time, dim): Tensor produced by the relative multi-headed self-attention module.
"""
def __init__(self, d_model: int, num_heads: int, dropout_p: float):
super().__init__()
self.attention = RelativeMultiHeadAttention(d_model=d_model, num_heads=num_heads)
self.dropout = nn.Dropout(p=dropout_p)
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: torch.Tensor,
encoding: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
batch_size, seq_length, _ = key.size() # pylint: disable=unused-variable
encoding = encoding[:, : key.shape[1]]
encoding = encoding.repeat(batch_size, 1, 1)
outputs, attn = self.attention(query, key, value, pos_embedding=encoding, mask=mask)
outputs = self.dropout(outputs)
return outputs, attn
class RelativeMultiHeadAttention(nn.Module):
"""
Multi-head attention with relative positional encoding.
This concept was proposed in the "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
Args:
d_model (int): The dimension of model
num_heads (int): The number of attention heads.
Inputs: query, key, value, pos_embedding, mask
- **query** (batch, time, dim): Tensor containing query vector
- **key** (batch, time, dim): Tensor containing key vector
- **value** (batch, time, dim): Tensor containing value vector
- **pos_embedding** (batch, time, dim): Positional embedding tensor
- **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
Returns:
- **outputs**: Tensor produced by the relative multi-head attention module.
"""
def __init__(
self,
d_model: int = 512,
num_heads: int = 16,
):
super().__init__()
assert d_model % num_heads == 0, "d_model % num_heads should be zero."
self.d_model = d_model
self.d_head = int(d_model / num_heads)
self.num_heads = num_heads
self.sqrt_dim = math.sqrt(d_model)
self.query_proj = nn.Linear(d_model, d_model)
self.key_proj = nn.Linear(d_model, d_model, bias=False)
self.value_proj = nn.Linear(d_model, d_model, bias=False)
self.pos_proj = nn.Linear(d_model, d_model, bias=False)
self.u_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
self.v_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
torch.nn.init.xavier_uniform_(self.u_bias)
torch.nn.init.xavier_uniform_(self.v_bias)
self.out_proj = nn.Linear(d_model, d_model)
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
pos_embedding: torch.Tensor,
mask: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
batch_size = query.shape[0]
query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.d_head)
key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
value = self.value_proj(value).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
pos_embedding = self.pos_proj(pos_embedding).view(batch_size, -1, self.num_heads, self.d_head)
u_bias = self.u_bias.expand_as(query)
v_bias = self.v_bias.expand_as(query)
a = (query + u_bias).transpose(1, 2)
content_score = a @ key.transpose(2, 3)
b = (query + v_bias).transpose(1, 2)
pos_score = b @ pos_embedding.permute(0, 2, 3, 1)
pos_score = self._relative_shift(pos_score)
score = content_score + pos_score
score = score * (1.0 / self.sqrt_dim)
score.masked_fill_(mask, -1e9)
attn = F.softmax(score, -1)
context = (attn @ value).transpose(1, 2)
context = context.contiguous().view(batch_size, -1, self.d_model)
return self.out_proj(context), attn
def _relative_shift(self, pos_score: torch.Tensor) -> torch.Tensor: # pylint: disable=no-self-use
batch_size, num_heads, seq_length1, seq_length2 = pos_score.size()
zeros = torch.zeros((batch_size, num_heads, seq_length1, 1), device=pos_score.device)
padded_pos_score = torch.cat([zeros, pos_score], dim=-1)
padded_pos_score = padded_pos_score.view(batch_size, num_heads, seq_length2 + 1, seq_length1)
pos_score = padded_pos_score[:, :, 1:].view_as(pos_score)
return pos_score
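# Mechanics of `_relative_shift` on a tiny [1, 1, 3, 3] example (illustrative comment only):
# prepending a zero column and re-viewing the buffer shifts row i left by (T - 1 - i), e.g.
#     [[a0, b0, c0],          [[c0, 0,  a1],
#      [a1, b1, c1],   --->    [b1, c1, 0 ],
#      [a2, b2, c2]]           [a2, b2, c2]]
# (the trailing entries wrap in from the zero pad and the next row). This is the relative-shift
# trick from Transformer-XL: each query row reads its positional scores at offsets relative to
# its own position instead of at absolute positions.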
class MultiHeadAttention(nn.Module):
"""
input:
query --- [N, T_q, query_dim]
key --- [N, T_k, key_dim]
output:
out --- [N, T_q, num_units]
"""
def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int):
super().__init__()
self.num_units = num_units
self.num_heads = num_heads
self.key_dim = key_dim
self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
def forward(self, query: torch.Tensor, key: torch.Tensor) -> torch.Tensor:
querys = self.W_query(query) # [N, T_q, num_units]
keys = self.W_key(key) # [N, T_k, num_units]
values = self.W_value(key)
split_size = self.num_units // self.num_heads
querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0) # [h, N, T_q, num_units/h]
keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
# score = softmax(QK^T / (d_k ** 0.5))
scores = torch.matmul(querys, keys.transpose(2, 3)) # [h, N, T_q, T_k]
scores = scores / (self.key_dim**0.5)
scores = F.softmax(scores, dim=3)
# out = score * V
out = torch.matmul(scores, values) # [h, N, T_q, num_units/h]
out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
return out
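# Usage sketch for the generic multi-head attention above (illustrative, with assumed sizes;
# in this codebase it is typically driven by a style-token layer):
#
#     mha = MultiHeadAttention(query_dim=64, key_dim=128, num_units=128, num_heads=4)
#     q = torch.randn(8, 1, 64)    # [N, T_q, query_dim], e.g. one prosody query per utterance
#     k = torch.randn(8, 10, 128)  # [N, T_k, key_dim], e.g. 10 style tokens
#     out = mha(q, k)              # -> [8, 1, 128]  (num_units)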

View File

@@ -0,0 +1,671 @@
from typing import Tuple
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
import torch.nn.functional as F
from torch.nn.utils import parametrize
from TTS.tts.layers.delightful_tts.kernel_predictor import KernelPredictor
def calc_same_padding(kernel_size: int) -> Tuple[int, int]:
pad = kernel_size // 2
return (pad, pad - (kernel_size + 1) % 2)
class ConvNorm(nn.Module):
"""A 1-dimensional convolutional layer with optional weight normalization.
This layer wraps a 1D convolutional layer from PyTorch and applies
optional weight normalization. The layer can be used in a similar way to
the convolutional layers in PyTorch's `torch.nn` module.
Args:
in_channels (int): The number of channels in the input signal.
out_channels (int): The number of channels in the output signal.
kernel_size (int, optional): The size of the convolving kernel.
Defaults to 1.
stride (int, optional): The stride of the convolution. Defaults to 1.
padding (int, optional): Zero-padding added to both sides of the input.
If `None`, the padding will be calculated so that the output has
the same length as the input. Defaults to `None`.
dilation (int, optional): Spacing between kernel elements. Defaults to 1.
bias (bool, optional): If `True`, add bias after convolution. Defaults to `True`.
w_init_gain (str, optional): The weight initialization function to use.
Can be either 'linear' or 'relu'. Defaults to 'linear'.
use_weight_norm (bool, optional): If `True`, apply weight normalization
to the convolutional weights. Defaults to `False`.
Shapes:
- Input: :math:`[N, D, T]`
- Output: :math:`[N, out_dim, T]` where `out_dim` is the number of output dimensions.
"""
def __init__(
self,
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=None,
dilation=1,
bias=True,
w_init_gain="linear",
use_weight_norm=False,
):
super(ConvNorm, self).__init__() # pylint: disable=super-with-arguments
if padding is None:
assert kernel_size % 2 == 1
padding = int(dilation * (kernel_size - 1) / 2)
self.kernel_size = kernel_size
self.dilation = dilation
self.use_weight_norm = use_weight_norm
conv_fn = nn.Conv1d
self.conv = conv_fn(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias,
)
nn.init.xavier_uniform_(self.conv.weight, gain=nn.init.calculate_gain(w_init_gain))
if self.use_weight_norm:
self.conv = nn.utils.parametrizations.weight_norm(self.conv)
def forward(self, signal, mask=None):
conv_signal = self.conv(signal)
if mask is not None:
# always re-zero output if mask is
# available to match zero-padding
conv_signal = conv_signal * mask
return conv_signal
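# Usage sketch for ConvNorm (illustrative): with `padding=None` and an odd kernel, the default
# padding keeps the time dimension unchanged.
#
#     conv = ConvNorm(in_channels=80, out_channels=256, kernel_size=5)
#     y = conv(torch.randn(4, 80, 100))  # -> [4, 256, 100]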
class ConvLSTMLinear(nn.Module):
def __init__(
self,
in_dim,
out_dim,
n_layers=2,
n_channels=256,
kernel_size=3,
p_dropout=0.1,
lstm_type="bilstm",
use_linear=True,
):
super(ConvLSTMLinear, self).__init__() # pylint: disable=super-with-arguments
self.out_dim = out_dim
self.lstm_type = lstm_type
self.use_linear = use_linear
self.dropout = nn.Dropout(p=p_dropout)
convolutions = []
for i in range(n_layers):
conv_layer = ConvNorm(
in_dim if i == 0 else n_channels,
n_channels,
kernel_size=kernel_size,
stride=1,
padding=int((kernel_size - 1) / 2),
dilation=1,
w_init_gain="relu",
)
conv_layer = nn.utils.parametrizations.weight_norm(conv_layer.conv, name="weight")
convolutions.append(conv_layer)
self.convolutions = nn.ModuleList(convolutions)
if not self.use_linear:
n_channels = out_dim
if self.lstm_type != "":
use_bilstm = False
lstm_channels = n_channels
if self.lstm_type == "bilstm":
use_bilstm = True
lstm_channels = int(n_channels // 2)
self.bilstm = nn.LSTM(n_channels, lstm_channels, 1, batch_first=True, bidirectional=use_bilstm)
lstm_norm_fn_pntr = nn.utils.spectral_norm
self.bilstm = lstm_norm_fn_pntr(self.bilstm, "weight_hh_l0")
if self.lstm_type == "bilstm":
self.bilstm = lstm_norm_fn_pntr(self.bilstm, "weight_hh_l0_reverse")
if self.use_linear:
self.dense = nn.Linear(n_channels, out_dim)
def run_padded_sequence(self, context, lens):
context_embedded = []
for b_ind in range(context.size()[0]): # TODO: speed up
curr_context = context[b_ind : b_ind + 1, :, : lens[b_ind]].clone()
for conv in self.convolutions:
curr_context = self.dropout(F.relu(conv(curr_context)))
context_embedded.append(curr_context[0].transpose(0, 1))
context = nn.utils.rnn.pad_sequence(context_embedded, batch_first=True)
return context
def run_unsorted_inputs(self, fn, context, lens): # pylint: disable=no-self-use
lens_sorted, ids_sorted = torch.sort(lens, descending=True)
unsort_ids = [0] * lens.size(0)
for i in range(len(ids_sorted)): # pylint: disable=consider-using-enumerate
unsort_ids[ids_sorted[i]] = i
lens_sorted = lens_sorted.long().cpu()
context = context[ids_sorted]
context = nn.utils.rnn.pack_padded_sequence(context, lens_sorted, batch_first=True)
context = fn(context)[0]
context = nn.utils.rnn.pad_packed_sequence(context, batch_first=True)[0]
# map back to original indices
context = context[unsort_ids]
return context
def forward(self, context, lens):
if context.size()[0] > 1:
context = self.run_padded_sequence(context, lens)
# to B, D, T
context = context.transpose(1, 2)
else:
for conv in self.convolutions:
context = self.dropout(F.relu(conv(context)))
if self.lstm_type != "":
context = context.transpose(1, 2)
self.bilstm.flatten_parameters()
if lens is not None:
context = self.run_unsorted_inputs(self.bilstm, context, lens)
else:
context = self.bilstm(context)[0]
context = context.transpose(1, 2)
x_hat = context
if self.use_linear:
x_hat = self.dense(context.transpose(1, 2)).transpose(1, 2)
return x_hat
class DepthWiseConv1d(nn.Module):
def __init__(self, in_channels: int, out_channels: int, kernel_size: int, padding: int):
super().__init__()
self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, padding=padding, groups=in_channels)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.conv(x)
class PointwiseConv1d(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
stride: int = 1,
padding: int = 0,
bias: bool = True,
):
super().__init__()
self.conv = nn.Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=stride,
padding=padding,
bias=bias,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.conv(x)
class BSConv1d(nn.Module):
"""https://arxiv.org/pdf/2003.13549.pdf"""
def __init__(self, channels_in: int, channels_out: int, kernel_size: int, padding: int):
super().__init__()
self.pointwise = nn.Conv1d(channels_in, channels_out, kernel_size=1)
self.depthwise = nn.Conv1d(
channels_out,
channels_out,
kernel_size=kernel_size,
padding=padding,
groups=channels_out,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x1 = self.pointwise(x)
x2 = self.depthwise(x1)
return x2
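# BSConv1d factorises a dense convolution into a 1x1 pointwise step followed by a depthwise step
# (see the linked paper), cutting parameters roughly from C_in*C_out*K to C_in*C_out + C_out*K.
# A quick illustrative check:
#
#     bsconv = BSConv1d(channels_in=256, channels_out=256, kernel_size=5, padding=2)
#     n_params = sum(p.numel() for p in bsconv.parameters())
#     # pointwise: 256*256 + 256 bias; depthwise: 256*5 + 256 bias -> 67,328 total,
#     # versus 256*256*5 + 256 = 327,936 for a dense nn.Conv1d(256, 256, 5).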
class BSConv2d(nn.Module):
"""https://arxiv.org/pdf/2003.13549.pdf"""
def __init__(self, channels_in: int, channels_out: int, kernel_size: int, padding: int):
super().__init__()
self.pointwise = nn.Conv2d(channels_in, channels_out, kernel_size=1)
self.depthwise = nn.Conv2d(
channels_out,
channels_out,
kernel_size=kernel_size,
padding=padding,
groups=channels_out,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x1 = self.pointwise(x)
x2 = self.depthwise(x1)
return x2
class Conv1dGLU(nn.Module):
"""From DeepVoice 3"""
def __init__(self, d_model: int, kernel_size: int, padding: int, embedding_dim: int):
super().__init__()
self.conv = BSConv1d(d_model, 2 * d_model, kernel_size=kernel_size, padding=padding)
self.embedding_proj = nn.Linear(embedding_dim, d_model)
self.register_buffer("sqrt", torch.sqrt(torch.FloatTensor([0.5])).squeeze(0))
self.softsign = torch.nn.Softsign()
def forward(self, x: torch.Tensor, embeddings: torch.Tensor) -> torch.Tensor:
x = x.permute((0, 2, 1))
residual = x
x = self.conv(x)
splitdim = 1
a, b = x.split(x.size(splitdim) // 2, dim=splitdim)
embeddings = self.embedding_proj(embeddings).unsqueeze(2)
softsign = self.softsign(embeddings)
softsign = softsign.expand_as(a)
a = a + softsign
x = a * torch.sigmoid(b)
x = x + residual
x = x * self.sqrt
x = x.permute((0, 2, 1))
return x
class ConvTransposed(nn.Module):
"""
A 1-D convolution applied over a channel-last tensor.
This layer transposes its `[B, T, C]` input to channel-first, applies a blueprint-separable
convolution (BSConv1d), and transposes the result back to `[B, T, C]`.
Attributes:
in_channels (int): The number of channels in the input tensor.
out_channels (int): The number of channels in the output tensor.
kernel_size (int): The size of the convolutional kernel. Default: 1.
padding (int): The number of padding elements to add to the input tensor. Default: 0.
conv (BSConv1d): The 1D convolutional transpose layer.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int = 1,
padding: int = 0,
):
super().__init__()
self.conv = BSConv1d(
in_channels,
out_channels,
kernel_size=kernel_size,
padding=padding,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = x.contiguous().transpose(1, 2)
x = self.conv(x)
x = x.contiguous().transpose(1, 2)
return x
class DepthwiseConvModule(nn.Module):
def __init__(self, dim: int, kernel_size: int = 7, expansion: int = 4, lrelu_slope: float = 0.3):
super().__init__()
padding = calc_same_padding(kernel_size)
self.depthwise = nn.Conv1d(
dim,
dim * expansion,
kernel_size=kernel_size,
padding=padding[0],
groups=dim,
)
self.act = nn.LeakyReLU(lrelu_slope)
self.out = nn.Conv1d(dim * expansion, dim, 1, 1, 0)
self.ln = nn.LayerNorm(dim)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.ln(x)
x = x.permute((0, 2, 1))
x = self.depthwise(x)
x = self.act(x)
x = self.out(x)
x = x.permute((0, 2, 1))
return x
class AddCoords(nn.Module):
def __init__(self, rank: int, with_r: bool = False):
super().__init__()
self.rank = rank
self.with_r = with_r
def forward(self, x: torch.Tensor) -> torch.Tensor:
if self.rank == 1:
batch_size_shape, channel_in_shape, dim_x = x.shape # pylint: disable=unused-variable
xx_range = torch.arange(dim_x, dtype=torch.int32)
xx_channel = xx_range[None, None, :]
xx_channel = xx_channel.float() / (dim_x - 1)
xx_channel = xx_channel * 2 - 1
xx_channel = xx_channel.repeat(batch_size_shape, 1, 1)
xx_channel = xx_channel.to(x.device)
out = torch.cat([x, xx_channel], dim=1)
if self.with_r:
rr = torch.sqrt(torch.pow(xx_channel - 0.5, 2))
out = torch.cat([out, rr], dim=1)
elif self.rank == 2:
batch_size_shape, channel_in_shape, dim_y, dim_x = x.shape
xx_ones = torch.ones([1, 1, 1, dim_x], dtype=torch.int32)
yy_ones = torch.ones([1, 1, 1, dim_y], dtype=torch.int32)
xx_range = torch.arange(dim_y, dtype=torch.int32)
yy_range = torch.arange(dim_x, dtype=torch.int32)
xx_range = xx_range[None, None, :, None]
yy_range = yy_range[None, None, :, None]
xx_channel = torch.matmul(xx_range, xx_ones)
yy_channel = torch.matmul(yy_range, yy_ones)
# transpose y
yy_channel = yy_channel.permute(0, 1, 3, 2)
xx_channel = xx_channel.float() / (dim_y - 1)
yy_channel = yy_channel.float() / (dim_x - 1)
xx_channel = xx_channel * 2 - 1
yy_channel = yy_channel * 2 - 1
xx_channel = xx_channel.repeat(batch_size_shape, 1, 1, 1)
yy_channel = yy_channel.repeat(batch_size_shape, 1, 1, 1)
xx_channel = xx_channel.to(x.device)
yy_channel = yy_channel.to(x.device)
out = torch.cat([x, xx_channel, yy_channel], dim=1)
if self.with_r:
rr = torch.sqrt(torch.pow(xx_channel - 0.5, 2) + torch.pow(yy_channel - 0.5, 2))
out = torch.cat([out, rr], dim=1)
elif self.rank == 3:
batch_size_shape, channel_in_shape, dim_z, dim_y, dim_x = x.shape
xx_ones = torch.ones([1, 1, 1, 1, dim_x], dtype=torch.int32)
yy_ones = torch.ones([1, 1, 1, 1, dim_y], dtype=torch.int32)
zz_ones = torch.ones([1, 1, 1, 1, dim_z], dtype=torch.int32)
xy_range = torch.arange(dim_y, dtype=torch.int32)
xy_range = xy_range[None, None, None, :, None]
yz_range = torch.arange(dim_z, dtype=torch.int32)
yz_range = yz_range[None, None, None, :, None]
zx_range = torch.arange(dim_x, dtype=torch.int32)
zx_range = zx_range[None, None, None, :, None]
xy_channel = torch.matmul(xy_range, xx_ones)
xx_channel = torch.cat([xy_channel + i for i in range(dim_z)], dim=2)
yz_channel = torch.matmul(yz_range, yy_ones)
yz_channel = yz_channel.permute(0, 1, 3, 4, 2)
yy_channel = torch.cat([yz_channel + i for i in range(dim_x)], dim=4)
zx_channel = torch.matmul(zx_range, zz_ones)
zx_channel = zx_channel.permute(0, 1, 4, 2, 3)
zz_channel = torch.cat([zx_channel + i for i in range(dim_y)], dim=3)
xx_channel = xx_channel.to(x.device)
yy_channel = yy_channel.to(x.device)
zz_channel = zz_channel.to(x.device)
out = torch.cat([x, xx_channel, yy_channel, zz_channel], dim=1)
if self.with_r:
rr = torch.sqrt(
torch.pow(xx_channel - 0.5, 2) + torch.pow(yy_channel - 0.5, 2) + torch.pow(zz_channel - 0.5, 2)
)
out = torch.cat([out, rr], dim=1)
else:
raise NotImplementedError
return out
class CoordConv1d(nn.modules.conv.Conv1d):
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int = 1,
padding: int = 0,
dilation: int = 1,
groups: int = 1,
bias: bool = True,
with_r: bool = False,
):
super().__init__(
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
)
self.rank = 1
self.addcoords = AddCoords(self.rank, with_r)
self.conv = nn.Conv1d(
in_channels + self.rank + int(with_r),
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.addcoords(x)
x = self.conv(x)
return x
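# Usage sketch for CoordConv1d (illustrative): AddCoords appends a normalised position channel
# (plus a radius channel when `with_r=True`) before the convolution, so the layer sees where
# along the sequence each frame sits.
#
#     coord_conv = CoordConv1d(in_channels=80, out_channels=128, kernel_size=3, padding=1, with_r=True)
#     y = coord_conv(torch.randn(2, 80, 200))  # input becomes [2, 82, 200] internally -> [2, 128, 200]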
class CoordConv2d(nn.modules.conv.Conv2d):
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int = 1,
padding: int = 0,
dilation: int = 1,
groups: int = 1,
bias: bool = True,
with_r: bool = False,
):
super().__init__(
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
)
self.rank = 2
self.addcoords = AddCoords(self.rank, with_r)
self.conv = nn.Conv2d(
in_channels + self.rank + int(with_r),
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.addcoords(x)
x = self.conv(x)
return x
class LVCBlock(torch.nn.Module):
"""the location-variable convolutions"""
def __init__( # pylint: disable=dangerous-default-value
self,
in_channels,
cond_channels,
stride,
dilations=[1, 3, 9, 27],
lReLU_slope=0.2,
conv_kernel_size=3,
cond_hop_length=256,
kpnet_hidden_channels=64,
kpnet_conv_size=3,
kpnet_dropout=0.0,
):
super().__init__()
self.cond_hop_length = cond_hop_length
self.conv_layers = len(dilations)
self.conv_kernel_size = conv_kernel_size
self.kernel_predictor = KernelPredictor(
cond_channels=cond_channels,
conv_in_channels=in_channels,
conv_out_channels=2 * in_channels,
conv_layers=len(dilations),
conv_kernel_size=conv_kernel_size,
kpnet_hidden_channels=kpnet_hidden_channels,
kpnet_conv_size=kpnet_conv_size,
kpnet_dropout=kpnet_dropout,
kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope},
)
self.convt_pre = nn.Sequential(
nn.LeakyReLU(lReLU_slope),
nn.utils.parametrizations.weight_norm(
nn.ConvTranspose1d(
in_channels,
in_channels,
2 * stride,
stride=stride,
padding=stride // 2 + stride % 2,
output_padding=stride % 2,
)
),
)
self.conv_blocks = nn.ModuleList()
for dilation in dilations:
self.conv_blocks.append(
nn.Sequential(
nn.LeakyReLU(lReLU_slope),
nn.utils.parametrizations.weight_norm(
nn.Conv1d(
in_channels,
in_channels,
conv_kernel_size,
padding=dilation * (conv_kernel_size - 1) // 2,
dilation=dilation,
)
),
nn.LeakyReLU(lReLU_slope),
)
)
def forward(self, x, c):
"""forward propagation of the location-variable convolutions.
Args:
x (Tensor): the input sequence (batch, in_channels, in_length)
c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
Returns:
Tensor: the output sequence (batch, in_channels, in_length)
"""
_, in_channels, _ = x.shape # (B, c_g, L')
x = self.convt_pre(x) # (B, c_g, stride * L')
kernels, bias = self.kernel_predictor(c)
for i, conv in enumerate(self.conv_blocks):
output = conv(x) # (B, c_g, stride * L')
k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length)
b = bias[:, i, :, :] # (B, 2 * c_g, cond_length)
output = self.location_variable_convolution(
output, k, b, hop_size=self.cond_hop_length
) # (B, 2 * c_g, stride * L'): LVC
x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh(
output[:, in_channels:, :]
) # (B, c_g, stride * L'): GAU
return x
def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256): # pylint: disable=no-self-use
"""perform location-variable convolution operation on the input sequence (x) using the local convolution kernl.
Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100.
Args:
x (Tensor): the input sequence (batch, in_channels, in_length).
kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length)
bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length)
dilation (int): the dilation of convolution.
hop_size (int): the hop_size of the conditioning sequence.
Returns:
(Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length).
"""
batch, _, in_length = x.shape
batch, _, out_channels, kernel_size, kernel_length = kernel.shape
assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched"
padding = dilation * int((kernel_size - 1) / 2)
x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding)
x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding)
if hop_size < dilation:
x = F.pad(x, (0, dilation), "constant", 0)
x = x.unfold(
3, dilation, dilation
) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation)
x = x[:, :, :, :, :hop_size]
x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation)
x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size)
o = torch.einsum("bildsk,biokl->bolsd", x, kernel)
o = o.to(memory_format=torch.channels_last_3d)
bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d)
o = o + bias
o = o.contiguous().view(batch, out_channels, -1)
return o
def remove_weight_norm(self):
self.kernel_predictor.remove_weight_norm()
parametrize.remove_parametrizations(self.convt_pre[1], "weight")
for block in self.conv_blocks:
parametrize.remove_parametrizations(block[1], "weight")
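# Usage sketch for LVCBlock (illustrative, with assumed sizes). The only hard constraint is that
# after the transposed-conv upsampling the audio-rate length equals cond_length * cond_hop_length,
# i.e. stride * L_in == cond_length * cond_hop_length.
#
#     lvc = LVCBlock(in_channels=8, cond_channels=16, stride=4, cond_hop_length=256)
#     x = torch.randn(1, 8, 128)   # hidden sequence, length L_in = 128
#     c = torch.randn(1, 16, 2)    # conditioning (e.g. mel) sequence, cond_length = 2
#     y = lvc(x, c)                # -> [1, 8, 512]  (stride * L_in == cond_length * cond_hop_length)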

View File

@@ -0,0 +1,261 @@
from typing import List, Tuple, Union
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
import torch.nn.functional as F
from TTS.tts.layers.delightful_tts.conformer import ConformerMultiHeadedSelfAttention
from TTS.tts.layers.delightful_tts.conv_layers import CoordConv1d
from TTS.tts.layers.delightful_tts.networks import STL
def get_mask_from_lengths(lengths: torch.Tensor) -> torch.Tensor:
batch_size = lengths.shape[0]
max_len = torch.max(lengths).item()
ids = torch.arange(0, max_len, device=lengths.device).unsqueeze(0).expand(batch_size, -1)
mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)
return mask
def stride_lens(lens: torch.Tensor, stride: int = 2) -> torch.Tensor:
return torch.ceil(lens / stride).int()
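# Quick illustration of the two helpers above:
#
#     get_mask_from_lengths(torch.tensor([3, 1]))
#     # tensor([[False, False, False],
#     #         [False,  True,  True]])   True marks padded positions
#     stride_lens(torch.tensor([5, 8]))   # tensor([3, 4], dtype=torch.int32)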
class ReferenceEncoder(nn.Module):
"""
Reference encoder for the utterance and phoneme prosody encoders. It is
made up of convolutional and RNN layers.
Args:
num_mels (int): Number of mel channels in the input spectrogram.
ref_enc_filters (list[int]): List of channel sizes for encoder layers.
ref_enc_size (int): Size of the kernel for the conv layers.
ref_enc_strides (List[int]): List of strides to use for conv layers.
ref_enc_gru_size (int): Number of hidden features for the gated recurrent unit.
Inputs: inputs, mask
- **inputs** (batch, dim, time): Tensor containing mel vector
- **lengths** (batch): Tensor containing the mel lengths.
Returns:
- **outputs** (batch, time, dim): Tensor produced by Reference Encoder.
"""
def __init__(
self,
num_mels: int,
ref_enc_filters: List[int],
ref_enc_size: int,
ref_enc_strides: List[int],
ref_enc_gru_size: int,
):
super().__init__()
n_mel_channels = num_mels
self.n_mel_channels = n_mel_channels
K = len(ref_enc_filters)
filters = [self.n_mel_channels] + ref_enc_filters
strides = [1] + ref_enc_strides
# Use CoordConv at the first layer to better preserve positional information: https://arxiv.org/pdf/1811.02122.pdf
convs = [
CoordConv1d(
in_channels=filters[0],
out_channels=filters[0 + 1],
kernel_size=ref_enc_size,
stride=strides[0],
padding=ref_enc_size // 2,
with_r=True,
)
]
convs2 = [
nn.Conv1d(
in_channels=filters[i],
out_channels=filters[i + 1],
kernel_size=ref_enc_size,
stride=strides[i],
padding=ref_enc_size // 2,
)
for i in range(1, K)
]
convs.extend(convs2)
self.convs = nn.ModuleList(convs)
self.norms = nn.ModuleList([nn.InstanceNorm1d(num_features=ref_enc_filters[i], affine=True) for i in range(K)])
self.gru = nn.GRU(
input_size=ref_enc_filters[-1],
hidden_size=ref_enc_gru_size,
batch_first=True,
)
def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
inputs --- [N, n_mels, timesteps]
outputs --- [N, E//2]
"""
mel_masks = get_mask_from_lengths(mel_lens).unsqueeze(1)
x = x.masked_fill(mel_masks, 0)
for conv, norm in zip(self.convs, self.norms):
x = conv(x)
x = F.leaky_relu(x, 0.3) # [N, 128, Ty//2^K, n_mels//2^K]
x = norm(x)
for _ in range(2):
mel_lens = stride_lens(mel_lens)
mel_masks = get_mask_from_lengths(mel_lens)
x = x.masked_fill(mel_masks.unsqueeze(1), 0)
x = x.permute((0, 2, 1))
x = torch.nn.utils.rnn.pack_padded_sequence(x, mel_lens.cpu().int(), batch_first=True, enforce_sorted=False)
self.gru.flatten_parameters()
x, memory = self.gru(x) # memory --- [N, Ty, E//2], out --- [1, N, E//2]
x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
return x, memory, mel_masks
def calculate_channels( # pylint: disable=no-self-use
self, L: int, kernel_size: int, stride: int, pad: int, n_convs: int
) -> int:
for _ in range(n_convs):
L = (L - kernel_size + 2 * pad) // stride + 1
return L
class UtteranceLevelProsodyEncoder(nn.Module):
def __init__(
self,
num_mels: int,
ref_enc_filters: List[int],
ref_enc_size: int,
ref_enc_strides: List[int],
ref_enc_gru_size: int,
dropout: float,
n_hidden: int,
bottleneck_size_u: int,
token_num: int,
):
"""
Encoder to extract prosody from an utterance. It is made up of a reference encoder
with a couple of linear layers and a style token layer with dropout.
Args:
num_mels (int): Number of mel channels in the input spectrogram.
ref_enc_filters (list[int]): List of channel sizes for ref encoder layers.
ref_enc_size (int): Size of the kernel for the ref encoder conv layers.
ref_enc_strides (List[int]): List of strides to use for the ref encoder conv layers.
ref_enc_gru_size (int): Number of hidden features for the gated recurrent unit.
dropout (float): Probability of dropout.
n_hidden (int): Size of hidden layers.
bottleneck_size_u (int): Size of the bottleneck layer.
Inputs: inputs, mask
- **inputs** (batch, dim, time): Tensor containing mel vector
- **lengths** (batch): Tensor containing the mel lengths.
Returns:
- **outputs** (batch, 1, dim): Tensor produced by Utterance Level Prosody Encoder.
"""
super().__init__()
self.E = n_hidden
self.d_q = self.d_k = n_hidden
bottleneck_size = bottleneck_size_u
self.encoder = ReferenceEncoder(
ref_enc_filters=ref_enc_filters,
ref_enc_gru_size=ref_enc_gru_size,
ref_enc_size=ref_enc_size,
ref_enc_strides=ref_enc_strides,
num_mels=num_mels,
)
self.encoder_prj = nn.Linear(ref_enc_gru_size, self.E // 2)
self.stl = STL(n_hidden=n_hidden, token_num=token_num)
self.encoder_bottleneck = nn.Linear(self.E, bottleneck_size)
self.dropout = nn.Dropout(dropout)
def forward(self, mels: torch.Tensor, mel_lens: torch.Tensor) -> torch.Tensor:
"""
Shapes:
mels: :math: `[B, C, T]`
mel_lens: :math: `[B]`
out --- [N, seq_len, E]
"""
_, embedded_prosody, _ = self.encoder(mels, mel_lens)
# Bottleneck
embedded_prosody = self.encoder_prj(embedded_prosody)
# Style Token
out = self.encoder_bottleneck(self.stl(embedded_prosody))
out = self.dropout(out)
out = out.view((-1, 1, out.shape[3]))
return out
class PhonemeLevelProsodyEncoder(nn.Module):
def __init__(
self,
num_mels: int,
ref_enc_filters: List[int],
ref_enc_size: int,
ref_enc_strides: List[int],
ref_enc_gru_size: int,
dropout: float,
n_hidden: int,
n_heads: int,
bottleneck_size_p: int,
):
super().__init__()
self.E = n_hidden
self.d_q = self.d_k = n_hidden
bottleneck_size = bottleneck_size_p
self.encoder = ReferenceEncoder(
ref_enc_filters=ref_enc_filters,
ref_enc_gru_size=ref_enc_gru_size,
ref_enc_size=ref_enc_size,
ref_enc_strides=ref_enc_strides,
num_mels=num_mels,
)
self.encoder_prj = nn.Linear(ref_enc_gru_size, n_hidden)
self.attention = ConformerMultiHeadedSelfAttention(
d_model=n_hidden,
num_heads=n_heads,
dropout_p=dropout,
)
self.encoder_bottleneck = nn.Linear(n_hidden, bottleneck_size)
def forward(
self,
x: torch.Tensor,
src_mask: torch.Tensor,
mels: torch.Tensor,
mel_lens: torch.Tensor,
encoding: torch.Tensor,
) -> torch.Tensor:
"""
x --- [N, seq_len, encoder_embedding_dim]
mels --- [N, Ty/r, n_mels*r], r=1
out --- [N, seq_len, bottleneck_size]
attn --- [N, seq_len, ref_len], Ty/r = ref_len
"""
embedded_prosody, _, mel_masks = self.encoder(mels, mel_lens)
# Bottleneck
embedded_prosody = self.encoder_prj(embedded_prosody)
attn_mask = mel_masks.view((mel_masks.shape[0], 1, 1, -1))
x, _ = self.attention(
query=x,
key=embedded_prosody,
value=embedded_prosody,
mask=attn_mask,
encoding=encoding,
)
x = self.encoder_bottleneck(x)
x = x.masked_fill(src_mask.unsqueeze(-1), 0.0)
return x

View File

@@ -0,0 +1,82 @@
from typing import Callable, Tuple
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
from TTS.tts.utils.helpers import average_over_durations
class EnergyAdaptor(nn.Module): # pylint: disable=abstract-method
"""Variance Adaptor with an added 1D conv layer. Used to
get energy embeddings.
Args:
channels_in (int): Number of in channels for conv layers.
channels_out (int): Number of out channels.
kernel_size (int): Size of the kernel for the conv layers.
dropout (float): Probability of dropout.
lrelu_slope (float): Slope for the leaky relu.
emb_kernel_size (int): Size of the kernel for the energy embedding.
Inputs: inputs, mask
- **inputs** (batch, time1, dim): Tensor containing input vector
- **target** (batch, 1, time2): Tensor containing the energy target
- **dr** (batch, time1): Tensor containing aligner durations vector
- **mask** (batch, time1): Tensor containing indices to be masked
Returns:
- **energy prediction** (batch, 1, time1): Tensor produced by energy predictor
- **energy embedding** (batch, channels, time1): Tensor produced by the energy adaptor
- **average energy target (train only)** (batch, 1, time1): Tensor produced after averaging over durations
"""
def __init__(
self,
channels_in: int,
channels_hidden: int,
channels_out: int,
kernel_size: int,
dropout: float,
lrelu_slope: float,
emb_kernel_size: int,
):
super().__init__()
self.energy_predictor = VariancePredictor(
channels_in=channels_in,
channels=channels_hidden,
channels_out=channels_out,
kernel_size=kernel_size,
p_dropout=dropout,
lrelu_slope=lrelu_slope,
)
self.energy_emb = nn.Conv1d(
1,
channels_hidden,
kernel_size=emb_kernel_size,
padding=int((emb_kernel_size - 1) / 2),
)
def get_energy_embedding_train(
self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Shapes:
x: :math: `[B, T_src, C]`
target: :math: `[B, 1, T_max2]`
dr: :math: `[B, T_src]`
mask: :math: `[B, T_src]`
"""
energy_pred = self.energy_predictor(x, mask)
energy_pred.unsqueeze_(1)
avg_energy_target = average_over_durations(target, dr)
energy_emb = self.energy_emb(avg_energy_target)
return energy_pred, avg_energy_target, energy_emb
def get_energy_embedding(self, x: torch.Tensor, mask: torch.Tensor, energy_transform: Callable) -> torch.Tensor:
energy_pred = self.energy_predictor(x, mask)
energy_pred.unsqueeze_(1)
if energy_transform is not None:
energy_pred = energy_transform(energy_pred, (~mask).sum(dim=(1, 2)), self.pitch_mean, self.pitch_std)
energy_emb_pred = self.energy_emb(energy_pred)
return energy_emb_pred, energy_pred

View File

@@ -0,0 +1,128 @@
import torch.nn as nn # pylint: disable=consider-using-from-import
from torch.nn.utils import parametrize
class KernelPredictor(nn.Module):
"""Kernel predictor for the location-variable convolutions
Args:
cond_channels (int): number of channels for the conditioning sequence,
conv_in_channels (int): number of channels for the input sequence,
conv_out_channels (int): number of channels for the output sequence,
conv_layers (int): number of layers
"""
def __init__( # pylint: disable=dangerous-default-value
self,
cond_channels,
conv_in_channels,
conv_out_channels,
conv_layers,
conv_kernel_size=3,
kpnet_hidden_channels=64,
kpnet_conv_size=3,
kpnet_dropout=0.0,
kpnet_nonlinear_activation="LeakyReLU",
kpnet_nonlinear_activation_params={"negative_slope": 0.1},
):
super().__init__()
self.conv_in_channels = conv_in_channels
self.conv_out_channels = conv_out_channels
self.conv_kernel_size = conv_kernel_size
self.conv_layers = conv_layers
kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers # l_w
kpnet_bias_channels = conv_out_channels * conv_layers # l_b
self.input_conv = nn.Sequential(
nn.utils.parametrizations.weight_norm(
nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)
),
getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
)
self.residual_convs = nn.ModuleList()
padding = (kpnet_conv_size - 1) // 2
for _ in range(3):
self.residual_convs.append(
nn.Sequential(
nn.Dropout(kpnet_dropout),
nn.utils.parametrizations.weight_norm(
nn.Conv1d(
kpnet_hidden_channels,
kpnet_hidden_channels,
kpnet_conv_size,
padding=padding,
bias=True,
)
),
getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
nn.utils.parametrizations.weight_norm(
nn.Conv1d(
kpnet_hidden_channels,
kpnet_hidden_channels,
kpnet_conv_size,
padding=padding,
bias=True,
)
),
getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
)
)
self.kernel_conv = nn.utils.parametrizations.weight_norm(
nn.Conv1d(
kpnet_hidden_channels,
kpnet_kernel_channels,
kpnet_conv_size,
padding=padding,
bias=True,
)
)
self.bias_conv = nn.utils.parametrizations.weight_norm(
nn.Conv1d(
kpnet_hidden_channels,
kpnet_bias_channels,
kpnet_conv_size,
padding=padding,
bias=True,
)
)
def forward(self, c):
"""
Args:
c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
"""
batch, _, cond_length = c.shape
c = self.input_conv(c)
for residual_conv in self.residual_convs:
residual_conv.to(c.device)
c = c + residual_conv(c)
k = self.kernel_conv(c)
b = self.bias_conv(c)
kernels = k.contiguous().view(
batch,
self.conv_layers,
self.conv_in_channels,
self.conv_out_channels,
self.conv_kernel_size,
cond_length,
)
bias = b.contiguous().view(
batch,
self.conv_layers,
self.conv_out_channels,
cond_length,
)
return kernels, bias
def remove_weight_norm(self):
parametrize.remove_parametrizations(self.input_conv[0], "weight")
parametrize.remove_parametrizations(self.kernel_conv, "weight")
parametrize.remove_parametrizations(self.bias_conv, "weight")
for block in self.residual_convs:
parametrize.remove_parametrizations(block[1], "weight")
parametrize.remove_parametrizations(block[3], "weight")
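A quick shape check for the kernel predictor above (class assumed in scope; the channel sizes below are made up): the flattened kernel and bias channels are reshaped into per-layer location-variable convolution weights.

import torch

kp = KernelPredictor(cond_channels=80, conv_in_channels=32, conv_out_channels=32, conv_layers=4)
c = torch.randn(2, 80, 100)        # (batch, cond_channels, cond_length)
kernels, bias = kp(c)
print(kernels.shape)               # torch.Size([2, 4, 32, 32, 3, 100])
print(bias.shape)                  # torch.Size([2, 4, 32, 100])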

View File

@ -0,0 +1,219 @@
import math
from typing import Tuple
import numpy as np
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
import torch.nn.functional as F
from TTS.tts.layers.delightful_tts.conv_layers import ConvNorm
def initialize_embeddings(shape: Tuple[int]) -> torch.Tensor:
assert len(shape) == 2, "Can only initialize 2-D embedding matrices ..."
# Kaiming initialization
return torch.randn(shape) * np.sqrt(2 / shape[1])
def positional_encoding(d_model: int, length: int, device: torch.device) -> torch.Tensor:
pe = torch.zeros(length, d_model, device=device)
position = torch.arange(0, length, dtype=torch.float, device=device).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2, device=device).float() * -(math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)
return pe
class BottleneckLayer(nn.Module):
"""
Bottleneck layer for reducing the dimensionality of a tensor.
Args:
in_dim: The number of input dimensions.
reduction_factor: The factor by which to reduce the number of dimensions.
norm: The normalization method to use. Can be "weightnorm" or "instancenorm".
non_linearity: The non-linearity to use. Can be "relu" or "leakyrelu".
kernel_size: The size of the convolutional kernel.
use_partial_padding: Whether to use partial padding with the convolutional kernel.
Shape:
- Input: :math:`[N, in_dim]` where `N` is the batch size and `in_dim` is the number of input dimensions.
- Output: :math:`[N, out_dim]` where `out_dim` is the number of output dimensions.
"""
def __init__(
self,
in_dim,
reduction_factor,
norm="weightnorm",
non_linearity="relu",
kernel_size=3,
use_partial_padding=False, # pylint: disable=unused-argument
):
super(BottleneckLayer, self).__init__() # pylint: disable=super-with-arguments
self.reduction_factor = reduction_factor
reduced_dim = int(in_dim / reduction_factor)
self.out_dim = reduced_dim
if self.reduction_factor > 1:
fn = ConvNorm(in_dim, reduced_dim, kernel_size=kernel_size, use_weight_norm=(norm == "weightnorm"))
if norm == "instancenorm":
fn = nn.Sequential(fn, nn.InstanceNorm1d(reduced_dim, affine=True))
self.projection_fn = fn
self.non_linearity = nn.ReLU()
if non_linearity == "leakyrelu":
self.non_linearity = nn.LeakyReLU()
def forward(self, x):
if self.reduction_factor > 1:
x = self.projection_fn(x)
x = self.non_linearity(x)
return x
class GLUActivation(nn.Module):
"""Class that implements the Gated Linear Unit (GLU) activation function.
The input is split in half along the channel dimension and the first half
is gated by a Leaky ReLU applied to the second half.
"""
def __init__(self, slope: float):
super().__init__()
self.lrelu = nn.LeakyReLU(slope)
def forward(self, x: torch.Tensor) -> torch.Tensor:
out, gate = x.chunk(2, dim=1)
x = out * self.lrelu(gate)
return x
class StyleEmbedAttention(nn.Module):
def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int):
super().__init__()
self.num_units = num_units
self.num_heads = num_heads
self.key_dim = key_dim
self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
def forward(self, query: torch.Tensor, key_soft: torch.Tensor) -> torch.Tensor:
values = self.W_value(key_soft)
split_size = self.num_units // self.num_heads
values = torch.stack(torch.split(values, split_size, dim=2), dim=0)
out_soft = scores_soft = None
querys = self.W_query(query) # [N, T_q, num_units]
keys = self.W_key(key_soft) # [N, T_k, num_units]
# [h, N, T_q, num_units/h]
querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0)
# [h, N, T_k, num_units/h]
keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0)
# [h, N, T_k, num_units/h]
# score = softmax(QK^T / (d_k ** 0.5))
scores_soft = torch.matmul(querys, keys.transpose(2, 3)) # [h, N, T_q, T_k]
scores_soft = scores_soft / (self.key_dim**0.5)
scores_soft = F.softmax(scores_soft, dim=3)
# out = score * V
# [h, N, T_q, num_units/h]
out_soft = torch.matmul(scores_soft, values)
out_soft = torch.cat(torch.split(out_soft, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
return out_soft # , scores_soft
class EmbeddingPadded(nn.Module):
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
super().__init__()
padding_mult = torch.ones((num_embeddings, 1), dtype=torch.int64)
padding_mult[padding_idx] = 0
self.register_buffer("padding_mult", padding_mult)
self.embeddings = nn.parameter.Parameter(initialize_embeddings((num_embeddings, embedding_dim)))
def forward(self, idx: torch.Tensor) -> torch.Tensor:
embeddings_zeroed = self.embeddings * self.padding_mult
x = F.embedding(idx, embeddings_zeroed)
return x
class EmbeddingProjBlock(nn.Module):
def __init__(self, embedding_dim: int):
super().__init__()
self.layers = nn.ModuleList(
[
nn.Linear(embedding_dim, embedding_dim),
nn.LeakyReLU(0.3),
nn.Linear(embedding_dim, embedding_dim),
nn.LeakyReLU(0.3),
]
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
res = x
for layer in self.layers:
x = layer(x)
x = x + res
return x
class LinearNorm(nn.Module):
def __init__(self, in_features: int, out_features: int, bias: bool = False):
super().__init__()
self.linear = nn.Linear(in_features, out_features, bias)
nn.init.xavier_uniform_(self.linear.weight)
if bias:
nn.init.constant_(self.linear.bias, 0.0)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.linear(x)
return x
class STL(nn.Module):
"""
A PyTorch module for the Style Token Layer (STL) as described in
"A Style-Based Generator Architecture for Generative Adversarial Networks"
(https://arxiv.org/abs/1812.04948)
The STL applies a multi-headed attention mechanism over the learned style tokens,
using the text input as the query and the style tokens as the keys and values.
The output of the attention mechanism is used as the text's style embedding.
Args:
token_num (int): The number of style tokens.
n_hidden (int): Number of hidden dimensions.
"""
def __init__(self, n_hidden: int, token_num: int):
super(STL, self).__init__() # pylint: disable=super-with-arguments
num_heads = 1
E = n_hidden
self.token_num = token_num
self.embed = nn.Parameter(torch.FloatTensor(self.token_num, E // num_heads))
d_q = E // 2
d_k = E // num_heads
self.attention = StyleEmbedAttention(query_dim=d_q, key_dim=d_k, num_units=E, num_heads=num_heads)
torch.nn.init.normal_(self.embed, mean=0, std=0.5)
def forward(self, x: torch.Tensor) -> torch.Tensor:
N = x.size(0)
query = x.unsqueeze(1) # [N, 1, E//2]
keys_soft = torch.tanh(self.embed).unsqueeze(0).expand(N, -1, -1) # [N, token_num, E // num_heads]
# Weighted sum
emotion_embed_soft = self.attention(query, keys_soft)
return emotion_embed_soft
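A short usage sketch for the style token layer above (STL and StyleEmbedAttention assumed in scope; the sizes are made up): the query is a pooled reference/prosody vector of size n_hidden // 2.

import torch

n_hidden, token_num = 128, 32
stl = STL(n_hidden=n_hidden, token_num=token_num)
ref = torch.randn(4, n_hidden // 2)   # pooled reference embedding [N, E // 2]
style = stl(ref)                      # weighted sum over the learned style tokens
print(style.shape)                    # torch.Size([4, 1, 128])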

View File

@ -0,0 +1,65 @@
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
from TTS.tts.layers.delightful_tts.conv_layers import ConvTransposed
class PhonemeProsodyPredictor(nn.Module):
"""Non-parallel Prosody Predictor inspired by: https://arxiv.org/pdf/2102.00851.pdf
It consists of 2 layers of 1D convolutions, each followed by a leaky ReLU activation, layer norm
and dropout, then finally a linear layer.
Args:
hidden_size (int): Size of hidden channels.
kernel_size (int): Kernel size for the conv layers.
dropout (float): Probability of dropout.
bottleneck_size (int): bottleneck size for last linear layer.
lrelu_slope (float): Slope of the leaky relu.
"""
def __init__(
self,
hidden_size: int,
kernel_size: int,
dropout: float,
bottleneck_size: int,
lrelu_slope: float,
):
super().__init__()
self.d_model = hidden_size
self.layers = nn.ModuleList(
[
ConvTransposed(
self.d_model,
self.d_model,
kernel_size=kernel_size,
padding=(kernel_size - 1) // 2,
),
nn.LeakyReLU(lrelu_slope),
nn.LayerNorm(self.d_model),
nn.Dropout(dropout),
ConvTransposed(
self.d_model,
self.d_model,
kernel_size=kernel_size,
padding=(kernel_size - 1) // 2,
),
nn.LeakyReLU(lrelu_slope),
nn.LayerNorm(self.d_model),
nn.Dropout(dropout),
]
)
self.predictor_bottleneck = nn.Linear(self.d_model, bottleneck_size)
def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
"""
Shapes:
x: :math: `[B, T, D]`
mask: :math: `[B, T]`
"""
mask = mask.unsqueeze(2)
for layer in self.layers:
x = layer(x)
x = x.masked_fill(mask, 0.0)
x = self.predictor_bottleneck(x)
return x
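A minimal sketch of the predictor above (class assumed in scope, ConvTransposed imported as in the file header; sizes are made up): padded positions are zeroed before the bottleneck projection.

import torch

predictor = PhonemeProsodyPredictor(
    hidden_size=256, kernel_size=5, dropout=0.1, bottleneck_size=16, lrelu_slope=0.3
)
x = torch.randn(2, 10, 256)                  # encoder outputs [B, T, D]
mask = torch.zeros(2, 10, dtype=torch.bool)  # True marks padded positions
print(predictor(x, mask).shape)              # torch.Size([2, 10, 16])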

View File

@ -0,0 +1,88 @@
from typing import Callable, Tuple
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
from TTS.tts.utils.helpers import average_over_durations
class PitchAdaptor(nn.Module): # pylint: disable=abstract-method
"""Module to get pitch embeddings via pitch predictor
Args:
n_input (int): Number of pitch predictor input channels.
n_hidden (int): Number of pitch predictor hidden channels.
n_out (int): Number of pitch predictor out channels.
kernel_size (int): Size of the kernel for conv layers.
emb_kernel_size (int): Size of the kernel for the pitch embedding.
p_dropout (float): Probability of dropout.
lrelu_slope (float): Slope for the leaky relu.
Inputs: inputs, mask
- **inputs** (batch, time1, dim): Tensor containing input vector
- **target** (batch, 1, time2): Tensor containing the pitch target
- **dr** (batch, time1): Tensor containing aligner durations vector
- **mask** (batch, time1): Tensor containing indices to be masked
Returns:
- **pitch prediction** (batch, 1, time1): Tensor produced by pitch predictor
- **pitch embedding** (batch, channels, time1): Tensor produced by the pitch adaptor
- **average pitch target (train only)** (batch, 1, time1): Tensor produced after averaging over durations
"""
def __init__(
self,
n_input: int,
n_hidden: int,
n_out: int,
kernel_size: int,
emb_kernel_size: int,
p_dropout: float,
lrelu_slope: float,
):
super().__init__()
self.pitch_predictor = VariancePredictor(
channels_in=n_input,
channels=n_hidden,
channels_out=n_out,
kernel_size=kernel_size,
p_dropout=p_dropout,
lrelu_slope=lrelu_slope,
)
self.pitch_emb = nn.Conv1d(
1,
n_input,
kernel_size=emb_kernel_size,
padding=int((emb_kernel_size - 1) / 2),
)
def get_pitch_embedding_train(
self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Shapes:
x: :math: `[B, T_src, C]`
target: :math: `[B, 1, T_max2]`
dr: :math: `[B, T_src]`
mask: :math: `[B, T_src]`
"""
pitch_pred = self.pitch_predictor(x, mask) # [B, T_src, C_hidden], [B, T_src] --> [B, T_src]
pitch_pred.unsqueeze_(1) # --> [B, 1, T_src]
avg_pitch_target = average_over_durations(target, dr) # [B, 1, T_mel], [B, T_src] --> [B, 1, T_src]
pitch_emb = self.pitch_emb(avg_pitch_target) # [B, 1, T_src] --> [B, C_hidden, T_src]
return pitch_pred, avg_pitch_target, pitch_emb
def get_pitch_embedding(
self,
x: torch.Tensor,
mask: torch.Tensor,
pitch_transform: Callable,
pitch_mean: torch.Tensor,
pitch_std: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
pitch_pred = self.pitch_predictor(x, mask)
if pitch_transform is not None:
pitch_pred = pitch_transform(pitch_pred, (~mask).sum(), pitch_mean, pitch_std)
pitch_pred.unsqueeze_(1)
pitch_emb_pred = self.pitch_emb(pitch_pred)
return pitch_emb_pred, pitch_pred
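A rough sketch of the inference path above (PitchAdaptor assumed in scope; sizes are made up): with no pitch_transform the predicted pitch is embedded directly.

import torch

B, T_src, C = 2, 4, 256
adaptor = PitchAdaptor(
    n_input=C, n_hidden=C, n_out=1, kernel_size=3,
    emb_kernel_size=3, p_dropout=0.1, lrelu_slope=0.3,
)
x = torch.randn(B, T_src, C)
mask = torch.zeros(B, T_src, dtype=torch.bool)
pitch_emb_pred, pitch_pred = adaptor.get_pitch_embedding(
    x, mask, pitch_transform=None, pitch_mean=None, pitch_std=None
)
print(pitch_emb_pred.shape, pitch_pred.shape)   # [2, 256, 4], [2, 1, 4]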

View File

@ -0,0 +1,68 @@
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
from TTS.tts.layers.delightful_tts.conv_layers import ConvTransposed
class VariancePredictor(nn.Module):
"""
The network consists of two 1D convolution layers, each followed by a leaky relu
activation, layer normalization and dropout, and finally an extra linear layer
that projects the hidden states into the output sequence.
Args:
channels_in (int): Number of in channels for conv layers.
channels (int): Number of hidden channels for the conv layers.
channels_out (int): Number of out channels for the last linear layer.
kernel_size (int): Size of the kernel for the conv layers.
p_dropout (float): Probability of dropout.
lrelu_slope (float): Slope for the leaky relu.
Inputs: inputs, mask
- **inputs** (batch, time, dim): Tensor containing input vector
- **mask** (batch, time): Tensor containing indices to be masked
Returns:
- **outputs** (batch, time): Tensor produced by last linear layer.
"""
def __init__(
self, channels_in: int, channels: int, channels_out: int, kernel_size: int, p_dropout: float, lrelu_slope: float
):
super().__init__()
self.layers = nn.ModuleList(
[
ConvTransposed(
channels_in,
channels,
kernel_size=kernel_size,
padding=(kernel_size - 1) // 2,
),
nn.LeakyReLU(lrelu_slope),
nn.LayerNorm(channels),
nn.Dropout(p_dropout),
ConvTransposed(
channels,
channels,
kernel_size=kernel_size,
padding=(kernel_size - 1) // 2,
),
nn.LeakyReLU(lrelu_slope),
nn.LayerNorm(channels),
nn.Dropout(p_dropout),
]
)
self.linear_layer = nn.Linear(channels, channels_out)
def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
"""
Shapes:
x: :math: `[B, T_src, C]`
mask: :math: `[B, T_src]`
"""
for layer in self.layers:
x = layer(x)
x = self.linear_layer(x)
x = x.squeeze(-1)
x = x.masked_fill(mask, 0.0)
return x
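A small shape check for the predictor above (class assumed in scope; sizes are made up), showing that with channels_out=1 the output is a per-token scalar with padded positions zeroed.

import torch

vp = VariancePredictor(
    channels_in=256, channels=256, channels_out=1,
    kernel_size=5, p_dropout=0.1, lrelu_slope=0.3,
)
x = torch.randn(2, 10, 256)                       # [B, T_src, C]
mask = torch.tensor([[False] * 10, [False] * 6 + [True] * 4])
out = vp(x, mask)
print(out.shape, out[1, 6:])                      # torch.Size([2, 10]); the last 4 values are 0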

View File

@ -81,7 +81,6 @@ class RelativePositionTransformerDecoder(nn.Module):
"""
def __init__(self, in_channels, out_channels, hidden_channels, params):
super().__init__()
self.prenet = Conv1dBN(in_channels, hidden_channels, 1, 1)
self.rel_pos_transformer = RelativePositionTransformer(in_channels, out_channels, hidden_channels, **params)
@ -111,7 +110,6 @@ class FFTransformerDecoder(nn.Module):
"""
def __init__(self, in_channels, out_channels, params):
super().__init__()
self.transformer_block = FFTransformerBlock(in_channels, **params)
self.postnet = nn.Conv1d(in_channels, out_channels, 1)

View File

@ -18,7 +18,6 @@ class DurationPredictor(nn.Module):
"""
def __init__(self, hidden_channels):
super().__init__()
self.layers = nn.ModuleList(

View File

@ -57,6 +57,15 @@ class AlignmentNetwork(torch.nn.Module):
nn.Conv1d(in_query_channels, attn_channels, kernel_size=1, padding=0, bias=True),
)
self.init_layers()
def init_layers(self):
torch.nn.init.xavier_uniform_(self.key_layer[0].weight, gain=torch.nn.init.calculate_gain("relu"))
torch.nn.init.xavier_uniform_(self.key_layer[2].weight, gain=torch.nn.init.calculate_gain("linear"))
torch.nn.init.xavier_uniform_(self.query_layer[0].weight, gain=torch.nn.init.calculate_gain("relu"))
torch.nn.init.xavier_uniform_(self.query_layer[2].weight, gain=torch.nn.init.calculate_gain("linear"))
torch.nn.init.xavier_uniform_(self.query_layer[4].weight, gain=torch.nn.init.calculate_gain("linear"))
def forward(
self, queries: torch.tensor, keys: torch.tensor, mask: torch.tensor = None, attn_prior: torch.tensor = None
) -> Tuple[torch.tensor, torch.tensor]:
@ -75,7 +84,9 @@ class AlignmentNetwork(torch.nn.Module):
attn_logp = -self.temperature * attn_factor.sum(1, keepdim=True)
if attn_prior is not None:
attn_logp = self.log_softmax(attn_logp) + torch.log(attn_prior[:, None] + 1e-8)
if mask is not None:
attn_logp.data.masked_fill_(~mask.bool().unsqueeze(2), -float("inf"))
attn = self.softmax(attn_logp)
return attn, attn_logp

View File

@ -100,7 +100,6 @@ class ResidualConv1dBNBlock(nn.Module):
def __init__(
self, in_channels, out_channels, hidden_channels, kernel_size, dilations, num_res_blocks=13, num_conv_blocks=2
):
super().__init__()
assert len(dilations) == num_res_blocks
self.res_blocks = nn.ModuleList()

View File

@ -1,5 +1,6 @@
import torch
from torch import nn
from torch.nn.utils import parametrize
@torch.jit.script
@ -62,7 +63,7 @@ class WN(torch.nn.Module):
# init conditioning layer
if c_in_channels > 0:
cond_layer = torch.nn.Conv1d(c_in_channels, 2 * hidden_channels * num_layers, 1)
self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
self.cond_layer = torch.nn.utils.parametrizations.weight_norm(cond_layer, name="weight")
# intermediate layers
for i in range(num_layers):
dilation = dilation_rate**i
@ -75,7 +76,7 @@ class WN(torch.nn.Module):
in_layer = torch.nn.Conv1d(
hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding
)
in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
in_layer = torch.nn.utils.parametrizations.weight_norm(in_layer, name="weight")
self.in_layers.append(in_layer)
if i < num_layers - 1:
@ -84,7 +85,7 @@ class WN(torch.nn.Module):
res_skip_channels = hidden_channels
res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
res_skip_layer = torch.nn.utils.parametrizations.weight_norm(res_skip_layer, name="weight")
self.res_skip_layers.append(res_skip_layer)
# setup weight norm
if not weight_norm:
@ -115,11 +116,11 @@ class WN(torch.nn.Module):
def remove_weight_norm(self):
if self.c_in_channels != 0:
torch.nn.utils.remove_weight_norm(self.cond_layer)
parametrize.remove_parametrizations(self.cond_layer, "weight")
for l in self.in_layers:
torch.nn.utils.remove_weight_norm(l)
parametrize.remove_parametrizations(l, "weight")
for l in self.res_skip_layers:
torch.nn.utils.remove_weight_norm(l)
parametrize.remove_parametrizations(l, "weight")
class WNBlocks(nn.Module):
@ -153,7 +154,6 @@ class WNBlocks(nn.Module):
dropout_p=0,
weight_norm=True,
):
super().__init__()
self.wn_blocks = nn.ModuleList()
for idx in range(num_blocks):
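A standalone sketch of the API migration applied in this diff: the parametrization-based weight norm replaces the deprecated torch.nn.utils.weight_norm, and remove_parametrizations replaces remove_weight_norm.

import torch
from torch import nn
from torch.nn.utils import parametrize

conv = nn.Conv1d(16, 32, 3, padding=1)
conv = nn.utils.parametrizations.weight_norm(conv, name="weight")
print(parametrize.is_parametrized(conv))     # True: weight is now computed from the parametrization
parametrize.remove_parametrizations(conv, "weight")
print(parametrize.is_parametrized(conv))     # False: same effect as the old remove_weight_norm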

View File

@ -1,6 +1,5 @@
from distutils.version import LooseVersion
import torch
from packaging.version import Version
from torch import nn
from torch.nn import functional as F
@ -91,7 +90,7 @@ class InvConvNear(nn.Module):
self.no_jacobian = no_jacobian
self.weight_inv = None
if LooseVersion(torch.__version__) < LooseVersion("1.9"):
if Version(torch.__version__) < Version("1.9"):
w_init = torch.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_())[0]
else:
w_init = torch.linalg.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_(), "complete")[0]
@ -187,7 +186,7 @@ class CouplingBlock(nn.Module):
self.sigmoid_scale = sigmoid_scale
# input layer
start = torch.nn.Conv1d(in_channels // 2, hidden_channels, 1)
start = torch.nn.utils.weight_norm(start)
start = torch.nn.utils.parametrizations.weight_norm(start)
self.start = start
# output layer
# Initializing last layer to 0 makes the affine coupling layers

View File

@ -64,7 +64,6 @@ class RelativePositionMultiHeadAttention(nn.Module):
proximal_bias=False,
proximal_init=False,
):
super().__init__()
assert channels % num_heads == 0, " [!] channels should be divisible by num_heads."
# class attributes
@ -272,7 +271,6 @@ class FeedForwardNetwork(nn.Module):
"""
def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dropout_p=0.0, causal=False):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels

View File

@ -165,7 +165,7 @@ class BCELossMasked(nn.Module):
def __init__(self, pos_weight: float = None):
super().__init__()
self.pos_weight = nn.Parameter(torch.tensor([pos_weight]), requires_grad=False)
self.register_buffer("pos_weight", torch.tensor([pos_weight]))
def forward(self, x, target, length):
"""
@ -191,16 +191,21 @@ class BCELossMasked(nn.Module):
mask = sequence_mask(sequence_length=length, max_len=target.size(1))
num_items = mask.sum()
loss = functional.binary_cross_entropy_with_logits(
x.masked_select(mask), target.masked_select(mask), pos_weight=self.pos_weight, reduction="sum"
x.masked_select(mask),
target.masked_select(mask),
pos_weight=self.pos_weight.to(x.device),
reduction="sum",
)
else:
loss = functional.binary_cross_entropy_with_logits(x, target, pos_weight=self.pos_weight, reduction="sum")
loss = functional.binary_cross_entropy_with_logits(
x, target, pos_weight=self.pos_weight.to(x.device), reduction="sum"
)
num_items = torch.numel(x)
loss = loss / num_items
return loss
class DifferentailSpectralLoss(nn.Module):
class DifferentialSpectralLoss(nn.Module):
"""Differential Spectral Loss
https://arxiv.org/ftp/arxiv/papers/1909/1909.10302.pdf"""
@ -335,7 +340,7 @@ class TacotronLoss(torch.nn.Module):
self.criterion_ga = GuidedAttentionLoss(sigma=ga_sigma)
# differential spectral loss
if c.postnet_diff_spec_alpha > 0 or c.decoder_diff_spec_alpha > 0:
self.criterion_diff_spec = DifferentailSpectralLoss(loss_func=self.criterion)
self.criterion_diff_spec = DifferentialSpectralLoss(loss_func=self.criterion)
# ssim loss
if c.postnet_ssim_alpha > 0 or c.decoder_ssim_alpha > 0:
self.criterion_ssim = SSIMLoss()
@ -363,7 +368,6 @@ class TacotronLoss(torch.nn.Module):
alignments_backwards,
input_lens,
):
# decoder outputs linear or mel spectrograms for Tacotron and Tacotron2
# the target should be set accordingly
postnet_target = linear_input if self.config.model.lower() in ["tacotron"] else mel_input
@ -801,6 +805,10 @@ class ForwardTTSLoss(nn.Module):
self.pitch_loss = MSELossMasked(False)
self.pitch_loss_alpha = c.pitch_loss_alpha
if c.model_args.use_energy:
self.energy_loss = MSELossMasked(False)
self.energy_loss_alpha = c.energy_loss_alpha
if c.use_ssim_loss:
self.ssim = SSIMLoss() if c.use_ssim_loss else None
self.ssim_loss_alpha = c.ssim_loss_alpha
@ -826,6 +834,8 @@ class ForwardTTSLoss(nn.Module):
dur_target,
pitch_output,
pitch_target,
energy_output,
energy_target,
input_lens,
alignment_logprob=None,
alignment_hard=None,
@ -855,6 +865,11 @@ class ForwardTTSLoss(nn.Module):
loss = loss + self.pitch_loss_alpha * pitch_loss
return_dict["loss_pitch"] = self.pitch_loss_alpha * pitch_loss
if hasattr(self, "energy_loss") and self.energy_loss_alpha > 0:
energy_loss = self.energy_loss(energy_output.transpose(1, 2), energy_target.transpose(1, 2), input_lens)
loss = loss + self.energy_loss_alpha * energy_loss
return_dict["loss_energy"] = self.energy_loss_alpha * energy_loss
if hasattr(self, "aligner_loss") and self.aligner_loss_alpha > 0:
aligner_loss = self.aligner_loss(alignment_logprob, input_lens, decoder_output_lens)
loss = loss + self.aligner_loss_alpha * aligner_loss
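A rough sketch of how the new energy term is computed, assuming MSELossMasked (defined elsewhere in this losses module) is in scope; the shapes follow the call above and the alpha value is made up.

import torch

B, T_src = 2, 6
energy_output = torch.randn(B, 1, T_src)     # predicted per-token energy
energy_target = torch.randn(B, 1, T_src)     # duration-averaged target energy
input_lens = torch.tensor([6, 4])
energy_loss_alpha = 0.1                      # made-up weight; comes from the config in practice
energy_loss = MSELossMasked(False)(
    energy_output.transpose(1, 2), energy_target.transpose(1, 2), input_lens
)
print(energy_loss_alpha * energy_loss)       # contribution added to the total loss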

View File

View File

@ -0,0 +1,323 @@
from typing import List, Tuple
import torch
import torch.nn.functional as F
from torch import nn
from tqdm.auto import tqdm
from TTS.tts.layers.tacotron.common_layers import Linear
from TTS.tts.layers.tacotron.tacotron2 import ConvBNBlock
class Encoder(nn.Module):
r"""Neural HMM Encoder
Same as the Tacotron 2 encoder but expands the output length by the number of states per phone
Args:
num_chars (int): Number of characters in the input.
state_per_phone (int): Number of states per phone.
in_out_channels (int): number of input and output channels.
n_convolutions (int): number of convolutional layers.
"""
def __init__(self, num_chars, state_per_phone, in_out_channels=512, n_convolutions=3):
super().__init__()
self.state_per_phone = state_per_phone
self.in_out_channels = in_out_channels
self.emb = nn.Embedding(num_chars, in_out_channels)
self.convolutions = nn.ModuleList()
for _ in range(n_convolutions):
self.convolutions.append(ConvBNBlock(in_out_channels, in_out_channels, 5, "relu"))
self.lstm = nn.LSTM(
in_out_channels,
int(in_out_channels / 2) * state_per_phone,
num_layers=1,
batch_first=True,
bias=True,
bidirectional=True,
)
self.rnn_state = None
def forward(self, x: torch.FloatTensor, x_len: torch.LongTensor) -> Tuple[torch.FloatTensor, torch.LongTensor]:
"""Forward pass to the encoder.
Args:
x (torch.FloatTensor): input text indices.
- shape: :math:`(b, T_{in})`
x_len (torch.LongTensor): input text lengths.
- shape: :math:`(b,)`
Returns:
Tuple[torch.FloatTensor, torch.LongTensor]: encoder outputs and output lengths.
-shape: :math:`((b, T_{in} * states_per_phone, in_out_channels), (b,))`
"""
b, T = x.shape
o = self.emb(x).transpose(1, 2)
for layer in self.convolutions:
o = layer(o)
o = o.transpose(1, 2)
o = nn.utils.rnn.pack_padded_sequence(o, x_len.cpu(), batch_first=True)
self.lstm.flatten_parameters()
o, _ = self.lstm(o)
o, _ = nn.utils.rnn.pad_packed_sequence(o, batch_first=True)
o = o.reshape(b, T * self.state_per_phone, self.in_out_channels)
x_len = x_len * self.state_per_phone
return o, x_len
def inference(self, x, x_len):
"""Inference to the encoder.
Args:
x (torch.FloatTensor): input text indices.
- shape: :math:`(b, T_{in})`
x_len (torch.LongTensor): input text lengths.
- shape: :math:`(b,)`
Returns:
Tuple[torch.FloatTensor, torch.LongTensor]: encoder outputs and output lengths.
-shape: :math:`((b, T_{in} * states_per_phone, in_out_channels), (b,))`
"""
b, T = x.shape
o = self.emb(x).transpose(1, 2)
for layer in self.convolutions:
o = layer(o)
o = o.transpose(1, 2)
# self.lstm.flatten_parameters()
o, _ = self.lstm(o)
o = o.reshape(b, T * self.state_per_phone, self.in_out_channels)
x_len = x_len * self.state_per_phone
return o, x_len
class ParameterModel(nn.Module):
r"""Main neural network of the outputnet
Note: Do not put dropout layers here, the model will not converge.
Args:
outputnet_size (List[int]): the architecture of the parameter model
input_size (int): size of input for the first layer
output_size (int): size of the output, i.e. the size of the feature dim
frame_channels (int): feature dim to set the flat start bias
flat_start_params (dict): flat start parameters to set the bias
"""
def __init__(
self,
outputnet_size: List[int],
input_size: int,
output_size: int,
frame_channels: int,
flat_start_params: dict,
):
super().__init__()
self.frame_channels = frame_channels
self.layers = nn.ModuleList(
[Linear(inp, out) for inp, out in zip([input_size] + outputnet_size[:-1], outputnet_size)]
)
self.last_layer = nn.Linear(outputnet_size[-1], output_size)
self.flat_start_output_layer(
flat_start_params["mean"], flat_start_params["std"], flat_start_params["transition_p"]
)
def flat_start_output_layer(self, mean, std, transition_p):
self.last_layer.weight.data.zero_()
self.last_layer.bias.data[0 : self.frame_channels] = mean
self.last_layer.bias.data[self.frame_channels : 2 * self.frame_channels] = OverflowUtils.inverse_softplus(std)
self.last_layer.bias.data[2 * self.frame_channels :] = OverflowUtils.inverse_sigmod(transition_p)
def forward(self, x):
for layer in self.layers:
x = F.relu(layer(x))
x = self.last_layer(x)
return x
class Outputnet(nn.Module):
r"""
This network takes current state and previous observed values as input
and returns its parameters, mean, standard deviation and probability
of transition to the next state
"""
def __init__(
self,
encoder_dim: int,
memory_rnn_dim: int,
frame_channels: int,
outputnet_size: List[int],
flat_start_params: dict,
std_floor: float = 1e-2,
):
super().__init__()
self.frame_channels = frame_channels
self.flat_start_params = flat_start_params
self.std_floor = std_floor
input_size = memory_rnn_dim + encoder_dim
output_size = 2 * frame_channels + 1
self.parametermodel = ParameterModel(
outputnet_size=outputnet_size,
input_size=input_size,
output_size=output_size,
flat_start_params=flat_start_params,
frame_channels=frame_channels,
)
def forward(self, ar_mels, inputs):
r"""Inputs observation and returns the means, stds and transition probability for the current state
Args:
ar_mel_inputs (torch.FloatTensor): shape (batch, prenet_dim)
states (torch.FloatTensor): (batch, hidden_states, hidden_state_dim)
Returns:
means: means for the emission observation for each feature
- shape: (B, hidden_states, feature_size)
stds: standard deviations for the emission observation for each feature
- shape: (batch, hidden_states, feature_size)
transition_vectors: transition vector for the current hidden state
- shape: (batch, hidden_states)
"""
batch_size, prenet_dim = ar_mels.shape[0], ar_mels.shape[1]
N = inputs.shape[1]
ar_mels = ar_mels.unsqueeze(1).expand(batch_size, N, prenet_dim)
ar_mels = torch.cat((ar_mels, inputs), dim=2)
ar_mels = self.parametermodel(ar_mels)
mean, std, transition_vector = (
ar_mels[:, :, 0 : self.frame_channels],
ar_mels[:, :, self.frame_channels : 2 * self.frame_channels],
ar_mels[:, :, 2 * self.frame_channels :].squeeze(2),
)
std = F.softplus(std)
std = self._floor_std(std)
return mean, std, transition_vector
def _floor_std(self, std):
r"""
Clamps the standard deviation so that it does not go below a floor value.
This prevents the model from cheating for higher likelihoods by collapsing
one of the Gaussians into a point mass.
Args:
std (float Tensor): tensor containing the standard deviation to be floored
"""
original_tensor = std.clone().detach()
std = torch.clamp(std, min=self.std_floor)
if torch.any(original_tensor != std):
print(
"[*] Standard deviation was floored! The model is preventing overfitting, nothing serious to worry about"
)
return std
class OverflowUtils:
@staticmethod
def get_data_parameters_for_flat_start(
data_loader: torch.utils.data.DataLoader, out_channels: int, states_per_phone: int
):
"""Generates data parameters for flat starting the HMM.
Args:
data_loader (torch.utils.data.DataLoader): data loader to iterate over.
out_channels (int): mel spectrogram channels
states_per_phone (int): HMM states per phone
"""
# State related information for transition_p
total_state_len = 0
total_mel_len = 0
# Useful for data mean and std
total_mel_sum = 0
total_mel_sq_sum = 0
for batch in tqdm(data_loader, leave=False):
text_lengths = batch["token_id_lengths"]
mels = batch["mel"]
mel_lengths = batch["mel_lengths"]
total_state_len += torch.sum(text_lengths)
total_mel_len += torch.sum(mel_lengths)
total_mel_sum += torch.sum(mels)
total_mel_sq_sum += torch.sum(torch.pow(mels, 2))
data_mean = total_mel_sum / (total_mel_len * out_channels)
data_std = torch.sqrt((total_mel_sq_sum / (total_mel_len * out_channels)) - torch.pow(data_mean, 2))
average_num_states = total_state_len / len(data_loader.dataset)
average_mel_len = total_mel_len / len(data_loader.dataset)
average_duration_each_state = average_mel_len / average_num_states
init_transition_prob = 1 / average_duration_each_state
return data_mean, data_std, (init_transition_prob * states_per_phone)
@staticmethod
@torch.no_grad()
def update_flat_start_transition(model, transition_p):
model.neural_hmm.output_net.parametermodel.flat_start_output_layer(0.0, 1.0, transition_p)
@staticmethod
def log_clamped(x, eps=1e-04):
"""
Avoids the log(0) problem
Args:
x (torch.tensor): input tensor
eps (float, optional): lower bound. Defaults to 1e-04.
Returns:
torch.tensor: :math:`log(x)`
"""
clamped_x = torch.clamp(x, min=eps)
return torch.log(clamped_x)
@staticmethod
def inverse_sigmod(x):
r"""
Inverse of the sigmoid function
"""
if not torch.is_tensor(x):
x = torch.tensor(x)
return OverflowUtils.log_clamped(x / (1.0 - x))
@staticmethod
def inverse_softplus(x):
r"""
Inverse of the softplus function
"""
if not torch.is_tensor(x):
x = torch.tensor(x)
return OverflowUtils.log_clamped(torch.exp(x) - 1.0)
@staticmethod
def logsumexp(x, dim):
r"""
Differentiable LogSumExp: does not create NaN gradients; when all the
inputs are -inf it yields 0 gradients.
Args:
x : torch.Tensor - The input tensor
dim: int - The dimension on which the log sum exp has to be applied
"""
m, _ = x.max(dim=dim)
mask = m == -float("inf")
s = (x - m.masked_fill_(mask, 0).unsqueeze(dim=dim)).exp().sum(dim=dim)
return s.masked_fill_(mask, 1).log() + m.masked_fill_(mask, -float("inf"))
@staticmethod
def double_pad(list_of_different_shape_tensors):
r"""
Pads the list of tensors in 2 dimensions
"""
second_dim_lens = [len(a) for a in [i[0] for i in list_of_different_shape_tensors]]
second_dim_max = max(second_dim_lens)
padded_x = [F.pad(x, (0, second_dim_max - len(x[0]))) for x in list_of_different_shape_tensors]
return nn.utils.rnn.pad_sequence(padded_x, batch_first=True)
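A small numerical check of the flat-start helpers above (OverflowUtils assumed in scope; the values are arbitrary): the output-net bias is set with these inverses so that softplus and sigmoid recover the requested std and transition probability at the first step.

import torch
import torch.nn.functional as F

std = torch.tensor(1.0)
transition_p = torch.tensor(0.14)
print(F.softplus(OverflowUtils.inverse_softplus(std)))            # ~1.0
print(torch.sigmoid(OverflowUtils.inverse_sigmod(transition_p)))  # ~0.14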

View File

@ -0,0 +1,81 @@
import torch
from torch import nn
from TTS.tts.layers.glow_tts.decoder import Decoder as GlowDecoder
from TTS.tts.utils.helpers import sequence_mask
class Decoder(nn.Module):
"""Uses glow decoder with some modifications.
::
Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze
Args:
in_channels (int): channels of input tensor.
hidden_channels (int): hidden decoder channels.
kernel_size (int): Coupling block kernel size. (Wavenet filter kernel size.)
dilation_rate (int): rate to increase dilation by each layer in a decoder block.
num_flow_blocks (int): number of decoder blocks.
num_coupling_layers (int): number of coupling layers. (number of wavenet layers.)
dropout_p (float): wavenet dropout rate.
sigmoid_scale (bool): enable/disable sigmoid scaling in coupling layer.
"""
def __init__(
self,
in_channels,
hidden_channels,
kernel_size,
dilation_rate,
num_flow_blocks,
num_coupling_layers,
dropout_p=0.0,
num_splits=4,
num_squeeze=2,
sigmoid_scale=False,
c_in_channels=0,
):
super().__init__()
self.glow_decoder = GlowDecoder(
in_channels,
hidden_channels,
kernel_size,
dilation_rate,
num_flow_blocks,
num_coupling_layers,
dropout_p,
num_splits,
num_squeeze,
sigmoid_scale,
c_in_channels,
)
self.n_sqz = num_squeeze
def forward(self, x, x_len, g=None, reverse=False):
"""
Input shapes:
- x: :math:`[B, C, T]`
- x_len :math:`[B]`
- g: :math:`[B, C]`
Output shapes:
- x: :math:`[B, C, T]`
- x_len :math:`[B]`
- logdet_tot :math:`[B]`
"""
x, x_len, x_max_len = self.preprocess(x, x_len, x_len.max())
x_mask = torch.unsqueeze(sequence_mask(x_len, x_max_len), 1).to(x.dtype)
x, logdet_tot = self.glow_decoder(x, x_mask, g, reverse)
return x, x_len, logdet_tot
def preprocess(self, y, y_lengths, y_max_length):
if y_max_length is not None:
y_max_length = torch.div(y_max_length, self.n_sqz, rounding_mode="floor") * self.n_sqz
y = y[:, :, :y_max_length]
y_lengths = torch.div(y_lengths, self.n_sqz, rounding_mode="floor") * self.n_sqz
return y, y_lengths, y_max_length
def store_inverse(self):
self.glow_decoder.store_inverse()
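A standalone illustration of the preprocess step above (no model needed): frames are trimmed so the mel length is divisible by the squeeze factor, here assumed to be 2.

import torch

n_sqz = 2
y = torch.randn(1, 80, 101)                 # [B, C, T] with an odd number of frames
y_lengths = torch.tensor([101])
y_max_length = torch.div(y_lengths.max(), n_sqz, rounding_mode="floor") * n_sqz
y = y[:, :, :y_max_length]
y_lengths = torch.div(y_lengths, n_sqz, rounding_mode="floor") * n_sqz
print(y.shape, y_lengths)                   # torch.Size([1, 80, 100]) tensor([100])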

View File

@ -0,0 +1,553 @@
from typing import List
import torch
import torch.distributions as tdist
import torch.nn.functional as F
from torch import nn
from torch.utils.checkpoint import checkpoint
from TTS.tts.layers.overflow.common_layers import Outputnet, OverflowUtils
from TTS.tts.layers.tacotron.common_layers import Prenet
from TTS.tts.utils.helpers import sequence_mask
class NeuralHMM(nn.Module):
"""Autoregressive left to right HMM model primarily used in "Neural HMMs are all you need (for high-quality attention-free TTS)"
Paper::
https://arxiv.org/abs/2108.13320
Paper abstract::
Neural sequence-to-sequence TTS has achieved significantly better output quality than statistical speech synthesis using
HMMs. However, neural TTS is generally not probabilistic and uses non-monotonic attention. Attention failures increase
training time and can make synthesis babble incoherently. This paper describes how the old and new paradigms can be
combined to obtain the advantages of both worlds, by replacing attention in neural TTS with an autoregressive left-right
no-skip hidden Markov model defined by a neural network. Based on this proposal, we modify Tacotron 2 to obtain an
HMM-based neural TTS model with monotonic alignment, trained to maximise the full sequence likelihood without
approximation. We also describe how to combine ideas from classical and contemporary TTS for best results. The resulting
example system is smaller and simpler than Tacotron 2, and learns to speak with fewer iterations and less data, whilst
achieving comparable naturalness prior to the post-net. Our approach also allows easy control over speaking rate.
Args:
frame_channels (int): Output dimension to generate.
ar_order (int): Autoregressive order of the model. In the Neural HMM ablations it was found that more autoregression gives more variation but hurts the naturalness of the synthesised audio.
deterministic_transition (bool): deterministic duration generation based on duration quantiles as defined in "S. Ronanki, O. Watts, S. King, and G. E. Henter, “Median-based generation of synthetic speech durations using a non-parametric approach,” in Proc. SLT, 2016.". Defaults to True.
encoder_dim (int): Channels of encoder input and character embedding tensors. Defaults to 512.
prenet_type (str): `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the Prenet.
prenet_dim (int): Dimension of the Prenet.
prenet_n_layers (int): Number of layers in the Prenet.
prenet_dropout (float): Dropout probability of the Prenet.
prenet_dropout_at_inference (bool): If True, dropout is applied at inference time.
memory_rnn_dim (int): Size of the memory RNN to process output of prenet.
outputnet_size (List[int]): Size of the output network inside the neural HMM.
flat_start_params (dict): Parameters for the flat start initialization of the neural HMM.
std_floor (float): Floor value for the standard deviation of the neural HMM. Prevents model cheating by putting point mass and getting infinite likelihood at any datapoint.
use_grad_checkpointing (bool, optional): Use gradient checkpointing to save memory. Defaults to True.
"""
def __init__(
self,
frame_channels: int,
ar_order: int,
deterministic_transition: bool,
encoder_dim: int,
prenet_type: str,
prenet_dim: int,
prenet_n_layers: int,
prenet_dropout: float,
prenet_dropout_at_inference: bool,
memory_rnn_dim: int,
outputnet_size: List[int],
flat_start_params: dict,
std_floor: float,
use_grad_checkpointing: bool = True,
):
super().__init__()
self.frame_channels = frame_channels
self.ar_order = ar_order
self.deterministic_transition = deterministic_transition
self.prenet_dim = prenet_dim
self.memory_rnn_dim = memory_rnn_dim
self.use_grad_checkpointing = use_grad_checkpointing
self.transition_model = TransitionModel()
self.emission_model = EmissionModel()
assert ar_order > 0, f"AR order must be greater than 0 provided {ar_order}"
self.ar_order = ar_order
self.prenet = Prenet(
in_features=frame_channels * ar_order,
prenet_type=prenet_type,
prenet_dropout=prenet_dropout,
dropout_at_inference=prenet_dropout_at_inference,
out_features=[self.prenet_dim for _ in range(prenet_n_layers)],
bias=False,
)
self.memory_rnn = nn.LSTMCell(input_size=prenet_dim, hidden_size=memory_rnn_dim)
self.output_net = Outputnet(
encoder_dim, memory_rnn_dim, frame_channels, outputnet_size, flat_start_params, std_floor
)
self.register_buffer("go_tokens", torch.zeros(ar_order, 1))
def forward(self, inputs, inputs_len, mels, mel_lens):
r"""HMM forward algorithm for training uses logarithmic version of Rabiner (1989) forward algorithm.
Args:
inputs (torch.FloatTensor): Encoder outputs
inputs_len (torch.LongTensor): Encoder output lengths
mels (torch.FloatTensor): Mel inputs
mel_lens (torch.LongTensor): Length of mel inputs
Shapes:
- inputs: (B, T, D_out_enc)
- inputs_len: (B)
- mels: (B, D_mel, T_mel)
- mel_lens: (B)
Returns:
log_prob (torch.FloatTensor): Log probability of the sequence
"""
# Get dimensions of inputs
batch_size, N, _ = inputs.shape
T_max = torch.max(mel_lens)
mels = mels.permute(0, 2, 1)
# Initialize forward algorithm
log_state_priors = self._initialize_log_state_priors(inputs)
log_c, log_alpha_scaled, transition_matrix, means = self._initialize_forward_algorithm_variables(mels, N)
# Initialize autoregression elements
ar_inputs = self._add_go_token(mels)
h_memory, c_memory = self._init_lstm_states(batch_size, self.memory_rnn_dim, mels)
for t in range(T_max):
# Process Autoregression
h_memory, c_memory = self._process_ar_timestep(t, ar_inputs, h_memory, c_memory)
# Get mean, std and transition vector from decoder for this timestep
# Note: Gradient checkpointing currently doesn't work with multiple GPUs inside a loop
if self.use_grad_checkpointing and self.training:
mean, std, transition_vector = checkpoint(self.output_net, h_memory, inputs)
else:
mean, std, transition_vector = self.output_net(h_memory, inputs)
if t == 0:
log_alpha_temp = log_state_priors + self.emission_model(mels[:, 0], mean, std, inputs_len)
else:
log_alpha_temp = self.emission_model(mels[:, t], mean, std, inputs_len) + self.transition_model(
log_alpha_scaled[:, t - 1, :], transition_vector, inputs_len
)
log_c[:, t] = torch.logsumexp(log_alpha_temp, dim=1)
log_alpha_scaled[:, t, :] = log_alpha_temp - log_c[:, t].unsqueeze(1)
transition_matrix[:, t] = transition_vector # needed for absorption state calculation
# Save for plotting
means.append(mean.detach())
log_c, log_alpha_scaled = self._mask_lengths(mel_lens, log_c, log_alpha_scaled)
sum_final_log_c = self.get_absorption_state_scaling_factor(
mel_lens, log_alpha_scaled, inputs_len, transition_matrix
)
log_probs = torch.sum(log_c, dim=1) + sum_final_log_c
return log_probs, log_alpha_scaled, transition_matrix, means
@staticmethod
def _mask_lengths(mel_lens, log_c, log_alpha_scaled):
"""
Mask the forward variables beyond the mel lengths so that the padded
timesteps do not contribute to the loss calculation
Args:
mel_lens (torch.IntTensor): (batch)
log_c (torch.FloatTensor): (batch, T)
log_alpha_scaled (torch.FloatTensor): (batch, T, N)
Returns:
log_c (torch.FloatTensor) : scaled probabilities (batch, T)
log_alpha_scaled (torch.FloatTensor): forward probabilities (batch, T, N)
"""
mask_log_c = sequence_mask(mel_lens)
log_c = log_c * mask_log_c
mask_log_alpha_scaled = mask_log_c.unsqueeze(2)
log_alpha_scaled = log_alpha_scaled * mask_log_alpha_scaled
return log_c, log_alpha_scaled
def _process_ar_timestep(
self,
t,
ar_inputs,
h_memory,
c_memory,
):
"""
Process autoregression in timestep
1. At a specific t timestep
2. Perform data dropout if applied (we did not use it)
3. Run the autoregressive frame through the prenet (has dropout)
4. Run the prenet output through the post prenet rnn
Args:
t (int): mel-spec timestep
ar_inputs (torch.FloatTensor): go-token appended mel-spectrograms
- shape: (b, D_out, T_out)
h_memory (torch.FloatTensor): previous timestep rnn hidden state
- shape: (b, memory_rnn_dim)
c_memory (torch.FloatTensor): previous timestep rnn cell state
- shape: (b, memory_rnn_dim)
Returns:
h_memory (torch.FloatTensor): rnn hidden state of the current timestep
c_memory (torch.FloatTensor): rnn cell state of the current timestep
"""
prenet_input = ar_inputs[:, t : t + self.ar_order].flatten(1)
memory_inputs = self.prenet(prenet_input)
h_memory, c_memory = self.memory_rnn(memory_inputs, (h_memory, c_memory))
return h_memory, c_memory
def _add_go_token(self, mel_inputs):
"""Append the go token to create the autoregressive input
Args:
mel_inputs (torch.FloatTensor): (batch_size, T, n_mel_channel)
Returns:
ar_inputs (torch.FloatTensor): (batch_size, T, n_mel_channel)
"""
batch_size, T, _ = mel_inputs.shape
go_tokens = self.go_tokens.unsqueeze(0).expand(batch_size, self.ar_order, self.frame_channels)
ar_inputs = torch.cat((go_tokens, mel_inputs), dim=1)[:, :T]
return ar_inputs
@staticmethod
def _initialize_forward_algorithm_variables(mel_inputs, N):
r"""Initialize placeholders for forward algorithm variables, to use a stable
version we will use log_alpha_scaled and the scaling constant
Args:
mel_inputs (torch.FloatTensor): (b, T_max, frame_channels)
N (int): number of states
Returns:
log_c (torch.FloatTensor): Scaling constant (b, T_max)
"""
b, T_max, _ = mel_inputs.shape
log_alpha_scaled = mel_inputs.new_zeros((b, T_max, N))
log_c = mel_inputs.new_zeros(b, T_max)
transition_matrix = mel_inputs.new_zeros((b, T_max, N))
# Saving for plotting later, will not have gradient tapes
means = []
return log_c, log_alpha_scaled, transition_matrix, means
@staticmethod
def _init_lstm_states(batch_size, hidden_state_dim, device_tensor):
r"""
Initialize Hidden and Cell states for LSTM Cell
Args:
batch_size (Int): batch size
hidden_state_dim (Int): dimensions of the h and c
device_tensor (torch.FloatTensor): useful for the device and type
Returns:
(torch.FloatTensor): shape (batch_size, hidden_state_dim)
can be hidden state for LSTM
(torch.FloatTensor): shape (batch_size, hidden_state_dim)
can be the cell state for LSTM
"""
return (
device_tensor.new_zeros(batch_size, hidden_state_dim),
device_tensor.new_zeros(batch_size, hidden_state_dim),
)
def get_absorption_state_scaling_factor(self, mels_len, log_alpha_scaled, inputs_len, transition_vector):
"""Returns the final scaling factor of absorption state
Args:
mels_len (torch.IntTensor): Input size of mels to
get the last timestep of log_alpha_scaled
log_alpha_scaled (torch.FloatTensor): State probabilities
inputs_len (torch.IntTensor): lengths of the states, used to
mask the values beyond the state lengths
(
Useful when the batch has very different lengths,
when the length of an observation is less than
the number of max states, then the log alpha after
the state value is filled with -infs. So we mask
those values so that it only consider the states
which are needed for that length
)
transition_vector (torch.FloatTensor): transition vector for each state per timestep
Shapes:
- mels_len: (batch_size)
- log_alpha_scaled: (batch_size, T, N)
- inputs_len: (batch_size)
- transition_vector: (batch_size, T, N)
Returns:
sum_final_log_c (torch.FloatTensor): (batch_size)
"""
N = torch.max(inputs_len)
max_inputs_len = log_alpha_scaled.shape[2]
state_lengths_mask = sequence_mask(inputs_len, max_len=max_inputs_len)
last_log_alpha_scaled_index = (
(mels_len - 1).unsqueeze(-1).expand(-1, N).unsqueeze(1)
) # Batch X Hidden State Size
last_log_alpha_scaled = torch.gather(log_alpha_scaled, 1, last_log_alpha_scaled_index).squeeze(1)
last_log_alpha_scaled = last_log_alpha_scaled.masked_fill(~state_lengths_mask, -float("inf"))
last_transition_vector = torch.gather(transition_vector, 1, last_log_alpha_scaled_index).squeeze(1)
last_transition_probability = torch.sigmoid(last_transition_vector)
log_probability_of_transitioning = OverflowUtils.log_clamped(last_transition_probability)
last_transition_probability_index = self.get_mask_for_last_item(inputs_len, inputs_len.device)
log_probability_of_transitioning = log_probability_of_transitioning.masked_fill(
~last_transition_probability_index, -float("inf")
)
final_log_c = last_log_alpha_scaled + log_probability_of_transitioning
# If the length of the mel is less than the number of states it will select the -inf values leading to nan gradients
# Ideally, we should clean the dataset; otherwise the clamp below is a small hack to avoid those values
final_log_c = final_log_c.clamp(min=torch.finfo(final_log_c.dtype).min)
sum_final_log_c = torch.logsumexp(final_log_c, dim=1)
return sum_final_log_c
@staticmethod
def get_mask_for_last_item(lengths, device, out_tensor=None):
"""Returns n-1 mask for the last item in the sequence.
Args:
lengths (torch.IntTensor): lengths in a batch
device (str, optional): Defaults to "cpu".
out_tensor (torch.Tensor, optional): uses the memory of a specific tensor.
Defaults to None.
Returns:
- Shape: :math:`(b, max_len)`
"""
max_len = torch.max(lengths).item()
ids = (
torch.arange(0, max_len, device=device) if out_tensor is None else torch.arange(0, max_len, out=out_tensor)
)
mask = ids == lengths.unsqueeze(1) - 1
return mask
@torch.inference_mode()
def inference(
self,
inputs: torch.FloatTensor,
input_lens: torch.LongTensor,
sampling_temp: float,
max_sampling_time: int,
duration_threshold: float,
):
"""Inference from autoregressive neural HMM
Args:
inputs (torch.FloatTensor): input states
- shape: :math:`(b, T, d)`
input_lens (torch.LongTensor): input state lengths
- shape: :math:`(b)`
sampling_temp (float): sampling temperature
max_sampling_time (int): max sampling time
duration_threshold (float): duration threshold to switch to next state
- Use this to change the speaking rate of the synthesised audio
"""
b = inputs.shape[0]
outputs = {
"hmm_outputs": [],
"hmm_outputs_len": [],
"alignments": [],
"input_parameters": [],
"output_parameters": [],
}
for i in range(b):
neural_hmm_outputs, states_travelled, input_parameters, output_parameters = self.sample(
inputs[i : i + 1], input_lens[i], sampling_temp, max_sampling_time, duration_threshold
)
outputs["hmm_outputs"].append(neural_hmm_outputs)
outputs["hmm_outputs_len"].append(neural_hmm_outputs.shape[0])
outputs["alignments"].append(states_travelled)
outputs["input_parameters"].append(input_parameters)
outputs["output_parameters"].append(output_parameters)
outputs["hmm_outputs"] = nn.utils.rnn.pad_sequence(outputs["hmm_outputs"], batch_first=True)
outputs["hmm_outputs_len"] = torch.tensor(
outputs["hmm_outputs_len"], dtype=input_lens.dtype, device=input_lens.device
)
return outputs
@torch.inference_mode()
def sample(self, inputs, input_lens, sampling_temp, max_sampling_time, duration_threshold):
"""Samples an output from the parameter models
Args:
inputs (torch.FloatTensor): input states
- shape: :math:`(1, T, d)`
input_lens (torch.LongTensor): input state lengths
- shape: :math:`(1)`
sampling_temp (float): sampling temperature
max_sampling_time (int): max sampling time
duration_threshold (float): duration threshold to switch to next state
Returns:
outputs (torch.FloatTensor): Output Observations
- Shape: :math:`(T, output_dim)`
states_travelled (list[int]): Hidden states travelled
- Shape: :math:`(T)`
input_parameters (list[torch.FloatTensor]): Input parameters
output_parameters (list[torch.FloatTensor]): Output parameters
"""
states_travelled, outputs, t = [], [], 0
# Sample initial state
current_state = 0
states_travelled.append(current_state)
# Prepare autoregression
prenet_input = self.go_tokens.unsqueeze(0).expand(1, self.ar_order, self.frame_channels)
h_memory, c_memory = self._init_lstm_states(1, self.memory_rnn_dim, prenet_input)
input_parameter_values = []
output_parameter_values = []
quantile = 1
while True:
memory_input = self.prenet(prenet_input.flatten(1).unsqueeze(0))
# will be 1 while sampling
h_memory, c_memory = self.memory_rnn(memory_input.squeeze(0), (h_memory, c_memory))
z_t = inputs[:, current_state].unsqueeze(0) # Add fake time dimension
mean, std, transition_vector = self.output_net(h_memory, z_t)
transition_probability = torch.sigmoid(transition_vector.flatten())
staying_probability = torch.sigmoid(-transition_vector.flatten())
# Save for plotting
input_parameter_values.append([prenet_input, current_state])
output_parameter_values.append([mean, std, transition_probability])
x_t = self.emission_model.sample(mean, std, sampling_temp=sampling_temp)
# Prepare autoregressive input for next iteration
prenet_input = torch.cat((prenet_input, x_t), dim=1)[:, 1:]
outputs.append(x_t.flatten())
transition_matrix = torch.cat((staying_probability, transition_probability))
quantile *= staying_probability
if not self.deterministic_transition:
switch = transition_matrix.multinomial(1)[0].item()
else:
switch = quantile < duration_threshold
if switch:
current_state += 1
quantile = 1
states_travelled.append(current_state)
if (current_state == input_lens) or (max_sampling_time and t == max_sampling_time - 1):
break
t += 1
return (
torch.stack(outputs, dim=0),
F.one_hot(input_lens.new_tensor(states_travelled)),
input_parameter_values,
output_parameter_values,
)
@staticmethod
def _initialize_log_state_priors(text_embeddings):
"""Creates the log pi in forward algorithm.
Args:
text_embeddings (torch.FloatTensor): used to create the log pi
on current device
Shapes:
- text_embeddings: (B, T, D_out_enc)
"""
N = text_embeddings.shape[1]
log_state_priors = text_embeddings.new_full([N], -float("inf"))
log_state_priors[0] = 0.0
return log_state_priors
class TransitionModel(nn.Module):
"""Transition Model of the HMM, it represents the probability of transitioning
from the current state to all other states"""
def forward(self, log_alpha_scaled, transition_vector, inputs_len): # pylint: disable=no-self-use
r"""
product of the past state with transitional probabilities in log space
Args:
log_alpha_scaled (torch.Tensor): Multiply previous timestep's alphas by
transition matrix (in log domain)
- shape: (batch size, N)
transition_vector (torch.tensor): transition vector for each state
- shape: (N)
inputs_len (int tensor): Lengths of states in a batch
- shape: (batch)
Returns:
out (torch.FloatTensor): log probability of transitioning to each state
"""
transition_p = torch.sigmoid(transition_vector)
staying_p = torch.sigmoid(-transition_vector)
log_staying_probability = OverflowUtils.log_clamped(staying_p)
log_transition_probability = OverflowUtils.log_clamped(transition_p)
staying = log_alpha_scaled + log_staying_probability
leaving = log_alpha_scaled + log_transition_probability
leaving = leaving.roll(1, dims=1)
leaving[:, 0] = -float("inf")
inputs_len_mask = sequence_mask(inputs_len)
out = OverflowUtils.logsumexp(torch.stack((staying, leaving), dim=2), dim=2)
out = out.masked_fill(~inputs_len_mask, -float("inf")) # There are no states to contribute to the loss
return out
class EmissionModel(nn.Module):
"""Emission Model of the HMM, it represents the probability of
emitting an observation based on the current state"""
def __init__(self) -> None:
super().__init__()
self.distribution_function: tdist.Distribution = tdist.normal.Normal
def sample(self, means, stds, sampling_temp):
return self.distribution_function(means, stds * sampling_temp).sample() if sampling_temp > 0 else means
def forward(self, x_t, means, stds, state_lengths):
r"""Calculates the log probability of the the given data (x_t)
being observed from states with given means and stds
Args:
x_t (float tensor) : observation at current time step
- shape: (batch, feature_dim)
means (float tensor): means of the distributions of hidden states
- shape: (batch, hidden_state, feature_dim)
stds (float tensor): standard deviations of the distributions of the hidden states
- shape: (batch, hidden_state, feature_dim)
state_lengths (int tensor): Lengths of states in a batch
- shape: (batch)
Returns:
out (float tensor): observation log likelihoods,
expressing the probability of an observation
being generated from a state i
shape: (batch, hidden_state)
"""
emission_dists = self.distribution_function(means, stds)
out = emission_dists.log_prob(x_t.unsqueeze(1))
state_lengths_mask = sequence_mask(state_lengths).unsqueeze(2)
out = torch.sum(out * state_lengths_mask, dim=2)
return out
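A minimal sketch of one step of the forward recursion using the two models above (EmissionModel and TransitionModel assumed in scope; sizes are made up); the two log-domain terms are added exactly as in NeuralHMM.forward.

import torch

B, N, D = 2, 5, 80
emission, transition = EmissionModel(), TransitionModel()
x_t = torch.randn(B, D)                               # one mel frame per batch item
means, stds = torch.randn(B, N, D), torch.ones(B, N, D)
state_lens = torch.tensor([5, 3])
log_alpha_prev = torch.log_softmax(torch.randn(B, N), dim=1)
transition_vector = torch.randn(B, N)
log_alpha_t = emission(x_t, means, stds, state_lens) + transition(
    log_alpha_prev, transition_vector, state_lens
)
print(log_alpha_t.shape)                              # torch.Size([2, 5])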

View File

@ -0,0 +1,79 @@
from typing import Any
import matplotlib.pyplot as plt
import numpy as np
import torch
def validate_numpy_array(value: Any):
r"""
Validates the input and makes sure it returns a numpy array (i.e. on CPU)
Args:
value (Any): the input value
Raises:
TypeError: if the value is not a numpy array or torch tensor
Returns:
np.ndarray: numpy array of the value
"""
if isinstance(value, np.ndarray):
pass
elif isinstance(value, list):
value = np.array(value)
elif torch.is_tensor(value):
value = value.cpu().numpy()
else:
raise TypeError("Value must be a numpy array, a torch tensor or a list")
return value
def get_spec_from_most_probable_state(log_alpha_scaled, means, decoder=None):
"""Get the most probable state means from the log_alpha_scaled.
Args:
log_alpha_scaled (torch.Tensor): Log alpha scaled values.
- Shape: :math:`(T, N)`
means (torch.Tensor): Means of the states.
- Shape: :math:`(N, T, D_out)`
decoder (torch.nn.Module): Decoder module to decode the latent to melspectrogram. Defaults to None.
"""
max_state_numbers = torch.max(log_alpha_scaled, dim=1)[1]
max_len = means.shape[0]
n_mel_channels = means.shape[2]
max_state_numbers = max_state_numbers.unsqueeze(1).unsqueeze(1).expand(max_len, 1, n_mel_channels)
means = torch.gather(means, 1, max_state_numbers).squeeze(1).to(log_alpha_scaled.dtype)
if decoder is not None:
mel = (
decoder(means.T.unsqueeze(0), torch.tensor([means.shape[0]], device=means.device), reverse=True)[0]
.squeeze(0)
.T
)
else:
mel = means
return mel
def plot_transition_probabilities_to_numpy(states, transition_probabilities, output_fig=False):
"""Generates trainsition probabilities plot for the states and the probability of transition.
Args:
states (torch.IntTensor): the states
transition_probabilities (torch.FloatTensor): the transition probabilities
"""
states = validate_numpy_array(states)
transition_probabilities = validate_numpy_array(transition_probabilities)
fig, ax = plt.subplots(figsize=(30, 3))
ax.plot(transition_probabilities, "o")
ax.set_title("Transition probability of state")
ax.set_xlabel("hidden state")
ax.set_ylabel("probability")
ax.set_xticks([i for i in range(len(transition_probabilities))]) # pylint: disable=unnecessary-comprehension
ax.set_xticklabels([int(x) for x in states], rotation=90)
plt.tight_layout()
if not output_fig:
plt.close()
return fig
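A quick usage sketch for the plotting helper above (assumed in scope), e.g. for logging how likely the model was to leave each visited state; matplotlib is imported by the module itself and the output path is hypothetical.

import torch

states = torch.arange(10)
transition_probabilities = torch.rand(10)
fig = plot_transition_probabilities_to_numpy(states, transition_probabilities, output_fig=True)
fig.savefig("transition_probabilities.png")   # hypothetical output path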

View File

@ -50,7 +50,6 @@ class GravesAttention(nn.Module):
COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi))
def __init__(self, query_dim, K):
super().__init__()
self._mask_value = 1e-8
self.K = K

View File

@ -83,7 +83,6 @@ class ReferenceEncoder(nn.Module):
"""
def __init__(self, num_mel, out_dim):
super().__init__()
self.num_mel = num_mel
filters = [1] + [32, 32, 64, 64, 128, 128]

View File

@ -31,7 +31,6 @@ class ReferenceEncoder(nn.Module):
"""
def __init__(self, num_mel, embedding_dim):
super().__init__()
self.num_mel = num_mel
filters = [1] + [32, 32, 64, 64, 128, 128]
@ -119,7 +118,6 @@ class MultiHeadAttention(nn.Module):
"""
def __init__(self, query_dim, key_dim, num_units, num_heads):
super().__init__()
self.num_units = num_units
self.num_heads = num_heads

Some files were not shown because too many files have changed in this diff.