Update Studio API for XTTS (#2861)

* Update Studio API for XTTS * Update the docs * Update README.md * Update README.md Update README
2023-08-13 12:04:12 +02:00 · 2023-08-13 12:04:12 +02:00 · 3a104d5c49
parent 37b558ccb9
commit 3a104d5c49
7 changed files with 432 additions and 258 deletions
--- a/README.md
+++ b/README.md
@ -108,7 +108,7 @@ Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not relea
 - Capacitron: [paper](https://arxiv.org/abs/1906.03402)
 - OverFlow: [paper](https://arxiv.org/abs/2211.06892)
 - Neural HMM TTS: [paper](https://arxiv.org/abs/2108.13320)
- Delightful TTS: [paper](https://arxiv.org/abs/2110.12612) 
+- Delightful TTS: [paper](https://arxiv.org/abs/2110.12612)

 ### End-to-End Models
 - VITS: [paper](https://arxiv.org/pdf/2106.06103)
@ -204,9 +204,11 @@ tts = TTS(model_name)
 wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
 # Text to speech to a file
 tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
+```

-# Running a single speaker model
+#### Running a single speaker model

+```python
 # Init TTS with the target model name
 tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
 # Run TTS
@ -218,15 +220,21 @@ tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_
 tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
 tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
 tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav")
+```

+#### Example voice conversion

-# Example voice conversion converting speaker of the `source_wav` to the speaker of the `target_wav`
+Converting the voice in `source_wav` to the voice of `target_wav`

+```python
 tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=True)
 tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
+```

-# Example voice cloning by a single speaker TTS model combining with the voice conversion model. This way, you can
-# clone voices by using any model in 🐸TTS.
+#### Example voice cloning together with the voice conversion model.
+This way, you can clone voices by using any model in 🐸TTS.
+
+```python

 tts = TTS("tts_models/de/thorsten/tacotron2-DDC")
 tts.tts_with_vc_to_file(
@ -234,29 +242,43 @@ tts.tts_with_vc_to_file(
    speaker_wav="target/speaker.wav",
    file_path="output.wav"
 )
+```

-# Example text to speech using [🐸Coqui Studio](https://coqui.ai) models.
+#### Example using [🐸Coqui Studio](https://coqui.ai) voices.
+You can access all of your cloned voices and built-in speakers in [🐸Coqui Studio](https://coqui.ai).
+To do this, you'll need an API token, which you can obtain from the [account page](https://coqui.ai/account).
+After obtaining the API token, set it in the `COQUI_STUDIO_TOKEN` environment variable.

-# You can use all of your available speakers in the studio.
-# [🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
-# You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
+Once you have a valid API token in place, the studio speakers will be displayed as distinct models within the list. 
+These models will follow the naming convention `coqui_studio/en/<studio_speaker_name>/coqui_studio`

-# If you have a valid API token set you will see the studio speakers as separate models in the list.
-# The name format is coqui_studio/en/<studio_speaker_name>/coqui_studio
-models = TTS().list_models()
+```python
+# XTTS model
+models = TTS(cs_api_model="XTTS").list_models()
 # Init TTS with the target studio speaker
 tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False, gpu=False)
 # Run TTS
 tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+
+# V1 model
+models = TTS(cs_api_model="V1").list_models()
 # Run TTS with emotion and speed control
+# Emotion control only works with V1 model
 tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)

+# XTTS-multilang model
+models = TTS(cs_api_model="XTTS-multilang").list_models()
+# Run TTS with language and speed control
+# Language control only works with the XTTS-multilang model
+tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0)
+```

-#Example text to speech using **Fairseq models in ~1100 languages** 🤯.
-
-#For these models use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
-#You can find the list of language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
+#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
+For Fairseq models, use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
+You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html)
+and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).

+```python
 # TTS with on the fly voice conversion
 api = TTS("tts_models/deu/fairseq/vits")
 api.tts_with_vc_to_file(
--- a/TTS/api.py
+++ b/TTS/api.py
@ -1,234 +1,15 @@
-import http.client
-import json
-import os
 import tempfile
-import urllib.request
 from pathlib import Path
-from typing import Tuple, Union
+from typing import Union

 import numpy as np
-import requests
-from scipy.io import wavfile

+from TTS.cs_api import CS_API
 from TTS.utils.audio.numpy_transforms import save_wav
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer


-class Speaker(object):
-    """Convert dict to object."""
-
-    def __init__(self, d, is_voice=False):
-        self.is_voice = is_voice
-        for k, v in d.items():
-            if isinstance(k, (list, tuple)):
-                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
-            else:
-                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)
-
-    def __repr__(self):
-        return str(self.__dict__)
-
-
-class CS_API:
-    """🐸Coqui Studio API Wrapper.
-
-    🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
-    interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
-    characteristics. You can use these voices to generate new audio files or use them in your applications.
-    You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
-    You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
-    https://app.coqui.ai/account. We can either enter the token as an environment variable as
-    `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<toke>)`.
-    Visit https://app.coqui.ai/api for more information.
-
-    Example listing all available speakers:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> tts.speakers
-
-    Example listing all emotions:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> tts.emotions
-
-    Example with a built-in 🐸 speaker:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> wav, sr = api.tts("Hello world", speaker_name="Claribel Dervla")
-        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
-    """
-
-    def __init__(self, api_token=None):
-        self.api_token = api_token
-        self.api_prefix = "/api/v2"
-        self.headers = None
-        self._speakers = None
-        self._check_token()
-
-    @staticmethod
-    def ping_api():
-        URL = "https://coqui.gateway.scarf.sh/tts/api"
-        _ = requests.get(URL)
-
-    @property
-    def speakers(self):
-        if self._speakers is None:
-            self._speakers = self.list_all_speakers()
-        return self._speakers
-
-    @property
-    def emotions(self):
-        """Return a list of available emotions.
-
-        TODO: Get this from the API endpoint.
-        """
-        return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
-
-    def _check_token(self):
-        if self.api_token is None:
-            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
-            self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
-        if not self.api_token:
-            raise ValueError(
-                "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n"
-                "Visit 🔗https://app.coqui.ai/account to get one.\n"
-                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
-                ""
-            )
-
-    def list_all_speakers(self):
-        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
-        return self.list_speakers() + self.list_voices()
-
-    def list_speakers(self):
-        """List built-in Coqui Studio speakers."""
-        self._check_token()
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        conn.request("GET", f"{self.api_prefix}/speakers?per_page=100", headers=self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        return [Speaker(s) for s in json.loads(data)["result"]]
-
-    def list_voices(self):
-        """List custom voices created by the user."""
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        conn.request("GET", f"{self.api_prefix}/voices", headers=self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        return [Speaker(s, True) for s in json.loads(data)["result"]]
-
-    def list_speakers_as_tts_models(self):
-        """List speakers in ModelManager format."""
-        models = []
-        for speaker in self.speakers:
-            model = f"coqui_studio/en/{speaker.name}/coqui_studio"
-            models.append(model)
-        return models
-
-    def name_to_speaker(self, name):
-        for speaker in self.speakers:
-            if speaker.name == name:
-                return speaker
-        raise ValueError(f"Speaker {name} not found in {self.speakers}")
-
-    def id_to_speaker(self, speaker_id):
-        for speaker in self.speakers:
-            if speaker.id == speaker_id:
-                return speaker
-        raise ValueError(f"Speaker {speaker_id} not found.")
-
-    @staticmethod
-    def url_to_np(url):
-        tmp_file, _ = urllib.request.urlretrieve(url)
-        rate, data = wavfile.read(tmp_file)
-        return data, rate
-
-    @staticmethod
-    def _create_payload(text, speaker, emotion, speed):
-        payload = {}
-        if speaker.is_voice:
-            payload["voice_id"] = speaker.id
-        else:
-            payload["speaker_id"] = speaker.id
-        payload.update(
-            {
-                "emotion": emotion,
-                "name": speaker.name,
-                "text": text,
-                "speed": speed,
-            }
-        )
-        return payload
-
-    def tts(
-        self,
-        text: str,
-        speaker_name: str = None,
-        speaker_id=None,
-        emotion="Neutral",
-        speed=1.0,
-        language=None,  # pylint: disable=unused-argument
-    ) -> Tuple[np.ndarray, int]:
-        """Synthesize speech from text.
-
-        Args:
-            text (str): Text to synthesize.
-            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
-                voices (user generated speakers) with `list_voices()`.
-            speaker_id (str): Speaker ID. If None, the speaker name is used.
-            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
-            speed (float): Speed of the speech. 1.0 is normal speed.
-            language (str): Language of the text. If None, the default language of the speaker is used.
-        """
-        self._check_token()
-        self.ping_api()
-        if speaker_name is None and speaker_id is None:
-            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
-        if speaker_id is None:
-            speaker = self.name_to_speaker(speaker_name)
-        else:
-            speaker = self.id_to_speaker(speaker_id)
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        payload = self._create_payload(text, speaker, emotion, speed)
-        conn.request("POST", "/api/v2/samples", json.dumps(payload), self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        try:
-            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
-        except KeyError as e:
-            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
-        return wav, sr
-
-    def tts_to_file(
-        self,
-        text: str,
-        speaker_name: str,
-        speaker_id=None,
-        emotion="Neutral",
-        speed=1.0,
-        language=None,
-        file_path: str = None,
-    ) -> str:
-        """Synthesize speech from text and save it to a file.
-
-        Args:
-            text (str): Text to synthesize.
-            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
-                voices (user generated speakers) with `list_voices()`.
-            speaker_id (str): Speaker ID. If None, the speaker name is used.
-            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
-            speed (float): Speed of the speech. 1.0 is normal speed.
-            language (str): Language of the text. If None, the default language of the speaker is used.
-            file_path (str): Path to save the file. If None, a temporary file is created.
-        """
-        if file_path is None:
-            file_path = tempfile.mktemp(".wav")
-        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
-        wavfile.write(file_path, sr, wav)
-        return file_path
-
-
 class TTS:
    """TODO: Add voice conversion and Capacitron support."""

@ -240,6 +21,7 @@ class TTS:
        vocoder_path: str = None,
        vocoder_config_path: str = None,
        progress_bar: bool = True,
+        cs_api_model: str = "XTTS",
        gpu=False,
    ):
        """🐸TTS python interface that allows to load and use the released models.
@ -275,6 +57,9 @@ class TTS:
            vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
            vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
            progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True.
+            cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
+                "XTTS", "XTTS-multilang", "V1". You can also use `TTS.cs_api.CS_API` for more control.
+                Defaults to "XTTS".
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
        """
        self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
@ -282,6 +67,7 @@ class TTS:
        self.synthesizer = None
        self.voice_converter = None
        self.csapi = None
+        self.cs_api_model = cs_api_model
        self.model_name = None

        if model_name is not None:
@ -333,10 +119,9 @@ class TTS:
    def get_models_file_path():
        return Path(__file__).parent / ".models.json"

-    @staticmethod
-    def list_models():
+    def list_models(self):
        try:
-            csapi = CS_API()
+            csapi = CS_API(model=self.cs_api_model)
            models = csapi.list_speakers_as_tts_models()
        except ValueError as e:
            print(e)
@ -468,7 +253,7 @@ class TTS:
        text: str,
        speaker_name: str = None,
        language: str = None,
-        emotion: str = "Neutral",
+        emotion: str = None,
        speed: float = 1.0,
        file_path: str = None,
    ) -> Union[np.ndarray, str]:
@ -479,10 +264,11 @@ class TTS:
                Input text to synthesize.
            speaker_name (str, optional):
                Speaker name from Coqui Studio. Defaults to None.
-            language (str, optional):
-                Language code. Coqui Studio currently supports only English. Defaults to None.
+            language (str, optional): Language of the text. Language is only supported (and required) by the
+                `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to None.
            emotion (str, optional):
-                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Defaults to "Neutral".
+                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
+                with "V1" model. Defaults to None.
            speed (float, optional):
                Speed of the speech. Defaults to 1.0.
            file_path (str, optional):
@ -521,9 +307,8 @@ class TTS:
            speaker (str, optional):
                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
-            language (str, optional):
-                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
-                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
+            language (str, optional): Language of the text. If None, the default language of the speaker is used.
+                Language is only supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl.
+                Defaults to None.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
@ -559,7 +344,7 @@ class TTS:
        speaker: str = None,
        language: str = None,
        speaker_wav: str = None,
-        emotion: str = "Neutral",
+        emotion: str = None,
        speed: float = 1.0,
        file_path: str = "output.wav",
        **kwargs,
--- a/TTS/bin/synthesize.py
+++ b/TTS/bin/synthesize.py
@ -185,11 +185,22 @@ If you don't specify any models, then it uses LJSpeech based English model.
    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

    # args for coqui studio
+    parser.add_argument(
+        "--cs_model",
+        type=str,
+        help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `XTTS-multilingual`, `V1`.",
+    )
    parser.add_argument(
        "--emotion",
        type=str,
-        help="Emotion to condition the model with. Only available for 🐸Coqui Studio models.",
-        default="Neutral",
+        help="Emotion to condition the model with. Only available for 🐸Coqui Studio `V1` model.",
+        default=None,
+    )
+    parser.add_argument(
+        "--language",
+        type=str,
+        help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.",
+        default=None,
    )

    # args for multi-speaker synthesis
@ -335,8 +346,8 @@ If you don't specify any models, then it uses LJSpeech based English model.
    # CASE3: TTS with coqui studio models
    if "coqui_studio" in args.model_name:
        print(" > Using 🐸Coqui Studio model: ", args.model_name)
-        api = TTS(model_name=args.model_name)
-        api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path)
+        api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
+        api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path, language=args.language)
        print(" > Saving output to ", args.out_path)
        return

--- a/TTS/cs_api.py
+++ b/TTS/cs_api.py
@ -0,0 +1,338 @@
+import http.client
+import json
+import os
+import tempfile
+import urllib.request
+from typing import Tuple
+
+import numpy as np
+import requests
+from scipy.io import wavfile
+
+
+class Speaker(object):
+    """Convert dict to object."""
+
+    def __init__(self, d, is_voice=False):
+        self.is_voice = is_voice
+        for k, v in d.items():
+            if isinstance(k, (list, tuple)):
+                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
+            else:
+                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)
+
+    def __repr__(self):
+        return str(self.__dict__)
+
+
+class CS_API:
+    """🐸Coqui Studio API Wrapper.
+
+    🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
+    interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
+    characteristics. You can use these voices to generate new audio files or use them in your applications.
+    You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
+    You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
+    https://app.coqui.ai/account. You can either set the token as an environment variable with
+    `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<token>)`.
+    Visit https://app.coqui.ai/api for more information.
+
+
+    Args:
+        api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
+            `COQUI_STUDIO_TOKEN`.
+        model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`.
+
+
+    Example listing all available speakers:
+        >>> from TTS.api import CS_API
+        >>> tts = CS_API()
+        >>> tts.speakers
+
+    Example listing all emotions:
+        >>> # emotions are only available for `V1` model
+        >>> from TTS.api import CS_API
+        >>> tts = CS_API(model="V1")
+        >>> tts.emotions
+
+    Example with a built-in 🐸 speaker:
+        >>> from TTS.api import CS_API
+        >>> tts = CS_API()
+        >>> wav, sr = tts.tts("Hello world", speaker_name=tts.speakers[0].name)
+        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
+
+    Example with multi-language model:
+        >>> from TTS.api import CS_API
+        >>> tts = CS_API(model="XTTS-multilang")
+        >>> wav, sr = tts.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
+    """
+
+    MODEL_ENDPOINTS = {
+        "V1": {
+            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
+            "synthesize": "https://app.coqui.ai/api/v2/samples",
+            "list_voices": "https://app.coqui.ai/api/v2/voices",
+        },
+        "XTTS": {
+            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
+            "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
+            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
+        },
+        "XTTS-multilang": {
+            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
+            "synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/",
+            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
+        },
+    }
+
+    SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"]
+
+    def __init__(self, api_token=None, model="XTTS"):
+        self.api_token = api_token
+        self.model = model
+        self.headers = None
+        self._speakers = None
+        self._check_token()
+
+    @staticmethod
+    def ping_api():
+        URL = "https://coqui.gateway.scarf.sh/tts/api"
+        _ = requests.get(URL)
+
+    @property
+    def speakers(self):
+        if self._speakers is None:
+            self._speakers = self.list_all_speakers()
+        return self._speakers
+
+    @property
+    def emotions(self):
+        """Return a list of available emotions.
+
+        TODO: Get this from the API endpoint.
+        """
+        if self.model == "V1":
+            return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
+        else:
+            raise ValueError(f"❗ Emotions are not available for {self.model}.")
+
+    def _check_token(self):
+        if self.api_token is None:
+            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
+            self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
+        if not self.api_token:
+            raise ValueError(
+                "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n"
+                "Visit 🔗https://app.coqui.ai/account to get one.\n"
+                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
+                ""
+            )
+
+    def list_all_speakers(self):
+        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
+        return self.list_speakers() + self.list_voices()
+
+    def list_speakers(self):
+        """List built-in Coqui Studio speakers."""
+        self._check_token()
+        conn = http.client.HTTPSConnection("app.coqui.ai")
+        url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
+        conn.request("GET", f"{url}?per_page=100", headers=self.headers)
+        res = conn.getresponse()
+        data = res.read()
+        return [Speaker(s) for s in json.loads(data)["result"]]
+
+    def list_voices(self):
+        """List custom voices created by the user."""
+        conn = http.client.HTTPSConnection("app.coqui.ai")
+        url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
+        conn.request("GET", f"{url}", headers=self.headers)
+        res = conn.getresponse()
+        data = res.read()
+        return [Speaker(s, True) for s in json.loads(data)["result"]]
+
+    def list_speakers_as_tts_models(self):
+        """List speakers in ModelManager format."""
+        models = []
+        for speaker in self.speakers:
+            model = f"coqui_studio/multilingual/{speaker.name}/{self.model}"
+            models.append(model)
+        return models
+
+    def name_to_speaker(self, name):
+        for speaker in self.speakers:
+            if speaker.name == name:
+                return speaker
+        raise ValueError(f"Speaker {name} not found in {self.speakers}")
+
+    def id_to_speaker(self, speaker_id):
+        for speaker in self.speakers:
+            if speaker.id == speaker_id:
+                return speaker
+        raise ValueError(f"Speaker {speaker_id} not found.")
+
+    @staticmethod
+    def url_to_np(url):
+        tmp_file, _ = urllib.request.urlretrieve(url)
+        rate, data = wavfile.read(tmp_file)
+        return data, rate
+
+    @staticmethod
+    def _create_payload(model, text, speaker, speed, emotion, language):
+        payload = {}
+        # if speaker.is_voice:
+        payload["voice_id"] = speaker.id
+        # else:
+        payload["speaker_id"] = speaker.id
+
+        if model == "V1":
+            payload.update(
+                {
+                    "emotion": emotion,
+                    "name": speaker.name,
+                    "text": text,
+                    "speed": speed,
+                }
+            )
+        elif model == "XTTS":
+            payload.update(
+                {
+                    "name": speaker.name,
+                    "text": text,
+                    "speed": speed,
+                }
+            )
+        elif model == "XTTS-multilang":
+            payload.update(
+                {
+                    "name": speaker.name,
+                    "text": text,
+                    "speed": speed,
+                    "language": language,
+                }
+            )
+        else:
+            raise ValueError(f"❗ Unknown model {model}")
+        return payload
+
+    def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, language):
+        assert text is not None, "❗ text is required for V1 model."
+        assert speaker_name is not None, "❗ speaker_name is required for V1 model."
+        if self.model == "V1":
+            if emotion is None:
+                emotion = "Neutral"
+            assert language is None, "❗ language is not supported for V1 model."
+        elif self.model == "XTTS":
+            assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model."
+            assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model."
+        elif self.model == "XTTS-multilang":
+            assert emotion is None, f"❗ Emotions are not supported for XTTS-multilang model. Use V1 model."
+            assert language is not None, "❗ Language is required for XTTS-multilang model."
+            assert (
+                language in self.SUPPORTED_LANGUAGES
+            ), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl"
+        return text, speaker_name, speaker_id, emotion, speed, language
+
+    def tts(
+        self,
+        text: str,
+        speaker_name: str = None,
+        speaker_id=None,
+        emotion=None,
+        speed=1.0,
+        language=None,  # pylint: disable=unused-argument
+    ) -> Tuple[np.ndarray, int]:
+        """Synthesize speech from text.
+
+        Args:
+            text (str): Text to synthesize.
+            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
+                voices (user generated speakers) with `list_voices()`.
+            speaker_id (str): Speaker ID. If None, the speaker name is used.
+            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only
+                supported by `V1` model. Defaults to None.
+            speed (float): Speed of the speech. 1.0 is normal speed.
+            language (str): Language of the text. Required by, and only supported by, the `XTTS-multilang` model;
+                must be left as None for the other models. Currently supports en, de, es, fr, it, pt, pl.
+                Defaults to None.
+        """
+        self._check_token()
+        self.ping_api()
+
+        if speaker_name is None and speaker_id is None:
+            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
+        if speaker_id is None:
+            speaker = self.name_to_speaker(speaker_name)
+        else:
+            speaker = self.id_to_speaker(speaker_id)
+
+        text, speaker_name, speaker_id, emotion, speed, language = self._check_tts_args(
+            text, speaker_name, speaker_id, emotion, speed, language
+        )
+
+        conn = http.client.HTTPSConnection("app.coqui.ai")
+        payload = self._create_payload(self.model, text, speaker, speed, emotion, language)
+        url = self.MODEL_ENDPOINTS[self.model]["synthesize"]
+        conn.request("POST", url, json.dumps(payload), self.headers)
+        res = conn.getresponse()
+        data = res.read()
+        try:
+            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
+        except KeyError as e:
+            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
+        return wav, sr
+
+    def tts_to_file(
+        self,
+        text: str,
+        speaker_name: str,
+        speaker_id=None,
+        emotion=None,
+        speed=1.0,
+        language=None,
+        file_path: str = None,
+    ) -> str:
+        """Synthesize speech from text and save it to a file.
+
+        Args:
+            text (str): Text to synthesize.
+            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
+                voices (user generated speakers) with `list_voices()`.
+            speaker_id (str): Speaker ID. If None, the speaker name is used.
+            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only
+                supported by `V1` model. Defaults to None.
+            speed (float): Speed of the speech. 1.0 is normal speed.
+            language (str): Language of the text. Required by, and only supported by, the `XTTS-multilang` model;
+                must be left as None for the other models. Currently supports en, de, es, fr, it, pt, pl.
+                Defaults to None.
+            file_path (str): Path to save the file. If None, a temporary file is created.
+        """
+        if file_path is None:
+            file_path = tempfile.mktemp(".wav")
+        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
+        wavfile.write(file_path, sr, wav)
+        return file_path
+
+
+if __name__ == "__main__":
+    import time
+
+    api = CS_API()
+    print(api.speakers)
+    print(api.list_speakers_as_tts_models())
+
+    ts = time.time()
+    wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name)
+    print(f" [i] XTTS took {time.time() - ts:.2f}s")
+
+    filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav")
+
+    api = CS_API(model="XTTS-multilang")
+    print(api.speakers)
+
+    ts = time.time()
+    wav, sr = api.tts(
+        "It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en"
+    )
+    print(f" [i] XTTS took {time.time() - ts:.2f}s")
+
+    filepath = api.tts_to_file(
+        text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en"
+    )
--- a/TTS/tts/models/tortoise.py
+++ b/TTS/tts/models/tortoise.py
@ -72,7 +72,7 @@ def load_discrete_vocoder_diffuser(
    )


-def format_conditioning(clip, cond_length=132300, device="cuda"):
+def format_conditioning(clip, cond_length=132300, device="cuda", **kwargs):
    """
    Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
    """
@ -82,7 +82,7 @@ def format_conditioning(clip, cond_length=132300, device="cuda"):
    elif gap > 0:
        rand_start = random.randint(0, gap)
        clip = clip[:, rand_start : rand_start + cond_length]
-    mel_clip = TorchMelSpectrogram()(clip.unsqueeze(0)).squeeze(0)
+    mel_clip = TorchMelSpectrogram(**kwargs)(clip.unsqueeze(0)).squeeze(0)
    return mel_clip.unsqueeze(0).to(device)


@ -321,6 +321,7 @@ class Tortoise(BaseTTS):

    def __init__(self, config: Coqpit):
        super().__init__(config, ap=None, tokenizer=None)
+        self.mel_norm_path = None
        self.config = config
        self.ar_checkpoint = self.args.ar_checkpoint
        self.diff_checkpoint = self.args.diff_checkpoint  # TODO: check if this is even needed
@ -429,7 +430,7 @@ class Tortoise(BaseTTS):

            auto_conds = []
            for ls in voice_samples:
-                auto_conds.append(format_conditioning(ls[0], device=self.device))
+                auto_conds.append(format_conditioning(ls[0], device=self.device, mel_norm_file=self.mel_norm_path))
            auto_conds = torch.stack(auto_conds, dim=1)
            with self.temporary_cuda(self.autoregressive) as ar:
                auto_latent = ar.get_conditioning(auto_conds)
@ -873,6 +874,7 @@ class Tortoise(BaseTTS):
        diff_path = diff_checkpoint_path or os.path.join(checkpoint_dir, "diffusion_decoder.pth")
        clvp_path = clvp_checkpoint_path or os.path.join(checkpoint_dir, "clvp2.pth")
        vocoder_checkpoint_path = vocoder_checkpoint_path or os.path.join(checkpoint_dir, "vocoder.pth")
+        self.mel_norm_path = os.path.join(checkpoint_dir, "mel_norms.pth")

        if os.path.exists(ar_path):
            # remove keys from the checkpoint that are not in the model
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@ -88,7 +88,7 @@ class ModelManager(object):

    def _list_models(self, model_type, model_count=0):
        if self.verbose:
-            print(" Name format: type/language/dataset/model")
+            print("\n Name format: type/language/dataset/model")
        model_list = []
        for lang in self.models_dict[model_type]:
            for dataset in self.models_dict[model_type][lang]:
--- a/docs/source/inference.md
+++ b/docs/source/inference.md
@ -191,9 +191,25 @@ from TTS.api import CS_API

 # Init 🐸 Coqui Studio API
 # you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.
-api = CS_API(api_token=<token>)
+
+# XTTS - Best quality and life-like speech in EN
+api = CS_API(api_token=<token>, model="XTTS")
+api.speakers  # all the speakers are available with all the models.
+api.list_speakers()
+api.list_voices()
+wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, emotion="Happy", speed=1.5)
+
+# XTTS-multilingual - Multilingual XTTS with [en, de, es, fr, it, pt, ...] (more langs coming soon)
+api = CS_API(api_token=<token>, model="XTTS-multilang")
 api.speakers
-api.emotions
+api.list_speakers()
+api.list_voices()
+wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, emotion="Happy", speed=1.5)
+
+# V1 - Fast and lightweight TTS in EN with emotion control.
+api = CS_API(api_token=<token>, model="V1")
+api.speakers
+api.emotions  # emotions are only for the V1 model.
 api.list_speakers()
 api.list_voices()
 wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)