From 2fd8cf3d94a4c3dd5567e8f444b541a14f116e50 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Mon, 27 Nov 2023 14:15:16 +0100
Subject: [PATCH 1/8] Make xtts runnable by version names

---
 TTS/api.py          | 11 +++++++++++
 TTS/utils/manage.py | 35 +++++++++++++++++++++++++++++++++--
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/TTS/api.py b/TTS/api.py
index 3331f30e..c207cb71 100644
--- a/TTS/api.py
+++ b/TTS/api.py
@@ -80,6 +80,8 @@ class TTS(nn.Module):
                 self.load_tts_model_by_name(model_name, gpu)
             elif "voice_conversion_models" in model_name:
                 self.load_vc_model_by_name(model_name, gpu)
+            else:
+                self.load_model_by_name(model_name, gpu)

         if model_path:
             self.load_tts_model_by_path(
@@ -149,6 +151,15 @@ class TTS(nn.Module):
             vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
         return model_path, config_path, vocoder_path, vocoder_config_path, None

+    def load_model_by_name(self, model_name: str, gpu: bool = False):
+        """Load one of the 🐸TTS models by name.
+
+        Args:
+            model_name (str): Model name to load. You can list models by ```tts.models```.
+            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
+        """
+        self.load_tts_model_by_name(model_name, gpu)
+
     def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
         """Load one of the voice conversion models by name.

diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index d3eb8104..bdfc2d95 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -1,5 +1,6 @@
 import json
 import os
+import re
 import tarfile
 import zipfile
 from pathlib import Path
@@ -276,13 +277,15 @@ class ModelManager(object):
             model_item["model_url"] = model_item["hf_url"]
         elif "fairseq" in model_item["model_name"]:
             model_item["model_url"] = "https://coqui.gateway.scarf.sh/fairseq/"
+        elif "xtts" in model_item["model_name"]:
+            model_item["model_url"] = "https://coqui.gateway.scarf.sh/xtts/"
         return model_item

     def _set_model_item(self, model_name):
         # fetch model info from the dict
-        model_type, lang, dataset, model = model_name.split("/")
-        model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
         if "fairseq" in model_name:
+            model_type = "tts_models"
+            lang = model_name.split("/")[1]
             model_item = {
                 "model_type": "tts_models",
                 "license": "CC BY-NC 4.0",
                 "default_vocoder": None,
                 "author": "fairseq",
                 "description": "this model is released by Meta under Fairseq repo. Visit https://github.com/facebookresearch/fairseq/tree/main/examples/mms for more info.",
             }
             model_item["model_name"] = model_name
+        elif "xtts" in model_name and len(model_name.split("/")) != 4:
+            # loading xtts models with only model name (e.g. xtts_v2.0.2)
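+            # a name matching v<major>.<minor>.<patch> resolves to that branch of
+            # the coqui/XTTS-v2 repo on Hugging Face; any other bare name
+            # (e.g. "xtts") falls back to the "main" branch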
+            # check if the model name has a version number with regex
+            version_regex = r"v\d+\.\d+\.\d+"
+            if re.search(version_regex, model_name):
+                model_version = model_name.split("_")[-1]
+            else:
+                model_version = "main"
+            model_type = "tts_models"
+            lang = "multilingual"
+            dataset = "multi-dataset"
+            model = model_name
+            model_item = {
+                "default_vocoder": None,
+                "license": "CPML",
+                "contact": "info@coqui.ai",
+                "tos_required": True,
+                "hf_url": [
+                    f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/model.pth",
+                    f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/config.json",
+                    f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/vocab.json",
+                    f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/hash.md5"
+                ],
+            }
+            print(model_item)
         else:
             # get model from models.json
+            model_type, lang, dataset, model = model_name.split("/")
             model_item = self.models_dict[model_type][lang][dataset][model]
             model_item["model_type"] = model_type
+
+        model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
         md5hash = model_item["model_hash"] if "model_hash" in model_item else None
         model_item = self.set_model_url(model_item)
         return model_item, model_full_name, model, md5hash

From 3b8894a3dd56357dbc5a3e8964abc0e8f89b5757 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Mon, 27 Nov 2023 14:15:50 +0100
Subject: [PATCH 2/8] Make style

---
 TTS/api.py                                 | 9 +++++++--
 TTS/tts/layers/xtts/trainer/gpt_trainer.py | 2 +-
 TTS/utils/manage.py                        | 3 +--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/TTS/api.py b/TTS/api.py
index c207cb71..31145464 100644
--- a/TTS/api.py
+++ b/TTS/api.py
@@ -12,6 +12,7 @@ from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
 from TTS.config import load_config

+
 class TTS(nn.Module):
     """TODO: Add voice conversion and Capacitron support."""

@@ -107,8 +108,12 @@ class TTS(nn.Module):
     @property
     def is_multi_lingual(self):
         # Not sure what sets this to None, but applied a fix to prevent crashing.
-        if (isinstance(self.model_name, str) and "xtts" in self.model_name or
-                self.config and ("xtts" in self.config.model or len(self.config.languages) > 1)):
+        if (
+            isinstance(self.model_name, str)
+            and "xtts" in self.model_name
+            or self.config
+            and ("xtts" in self.config.model or len(self.config.languages) > 1)
+        ):
             return True
         if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
             return self.synthesizer.tts_model.language_manager.num_languages > 1

diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py
index 4789e1f4..61222dac 100644
--- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py
+++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py
@@ -319,7 +319,7 @@ class GPTTrainer(BaseTTS):
         return self.train_step(batch, criterion)

     def on_train_epoch_start(self, trainer):
-        trainer.model.eval() # the whole model to eval
+        trainer.model.eval()  # the whole model to eval
         # put gpt model in training mode
         trainer.model.xtts.gpt.train()

diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index bdfc2d95..35323782 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -27,7 +27,6 @@ LICENSE_URLS = {
 }


-
 class ModelManager(object):
     tqdm_progress = None
     """Manage TTS models defined in .models.json.
@@ -315,7 +314,7 @@ class ModelManager(object):
                     f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/model.pth",
                     f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/config.json",
                     f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/vocab.json",
-                    f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/hash.md5"
+                    f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/hash.md5",
                 ],
             }
             print(model_item)

From b75e90ba85c7bc985b13cc0ca08ab29184fbc96f Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Mon, 27 Nov 2023 14:53:11 +0100
Subject: [PATCH 3/8] Make text splitting optional

---
 TTS/api.py               | 48 ++++++++++++++++++++++++++++++++++++----
 TTS/utils/manage.py      |  1 -
 TTS/utils/synthesizer.py |  9 ++++++--
 3 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/TTS/api.py b/TTS/api.py
index 31145464..0ae515fa 100644
--- a/TTS/api.py
+++ b/TTS/api.py
@@ -326,6 +326,7 @@ class TTS(nn.Module):
         speaker_wav: str = None,
         emotion: str = None,
         speed: float = None,
+        split_sentences: bool = True,
         **kwargs,
     ):
         """Convert text to speech.
@@ -346,6 +347,12 @@ class TTS(nn.Module):
             speed (float, optional):
                 Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
                 Defaults to None.
+            split_sentences (bool, optional):
+                Split text into sentences, synthesize them separately and concatenate the resulting audio.
+                Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
+                applicable to the 🐸TTS models. Defaults to True.
+            kwargs (dict, optional):
+                Additional arguments for the model.
         """
         self._check_arguments(
             speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
@@ -363,6 +370,7 @@ class TTS(nn.Module):
             style_wav=None,
             style_text=None,
             reference_speaker_name=None,
+            split_sentences=split_sentences,
             **kwargs,
         )
         return wav
@@ -377,6 +385,7 @@ class TTS(nn.Module):
         speed: float = 1.0,
         pipe_out=None,
         file_path: str = "output.wav",
+        split_sentences: bool = True,
         **kwargs,
     ):
         """Convert text to speech.
@@ -401,6 +410,10 @@ class TTS(nn.Module):
                 Flag to stdout the generated TTS wav file for shell pipe.
             file_path (str, optional):
                 Output file path. Defaults to "output.wav".
+            split_sentences (bool, optional):
+                Split text into sentences, synthesize them separately and concatenate the resulting audio.
+                Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
+                applicable to the 🐸TTS models. Defaults to True.
             kwargs (dict, optional):
                 Additional arguments for the model.
         """
@@ -416,7 +429,14 @@ class TTS(nn.Module):
             file_path=file_path,
             pipe_out=pipe_out,
         )
-        wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
+        wav = self.tts(
+            text=text,
+            speaker=speaker,
+            language=language,
+            speaker_wav=speaker_wav,
+            split_sentences=split_sentences,
+            **kwargs,
+        )
         self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
         return file_path
@@ -456,7 +476,14 @@ class TTS(nn.Module):
         save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
         return file_path

-    def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None, speaker: str = None):
+    def tts_with_vc(
+        self,
+        text: str,
+        language: str = None,
+        speaker_wav: str = None,
+        speaker: str = None,
+        split_sentences: bool = True,
+    ):
         """Convert text to speech with voice conversion.

         It combines tts with voice conversion to fake voice cloning.
@@ -476,10 +503,16 @@ class TTS(nn.Module):
             speaker (str, optional):
                 Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                 `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
+            split_sentences (bool, optional):
+                Split text into sentences, synthesize them separately and concatenate the resulting audio.
+                Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
+                applicable to the 🐸TTS models. Defaults to True.
         """
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
             # Lazy code... save it to a temp file to resample it while reading it for VC
-            self.tts_to_file(text=text, speaker=speaker, language=language, file_path=fp.name)
+            self.tts_to_file(
+                text=text, speaker=speaker, language=language, file_path=fp.name, split_sentences=split_sentences
+            )
         if self.voice_converter is None:
             self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
         wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
@@ -492,6 +525,7 @@ class TTS(nn.Module):
         speaker_wav: str = None,
         file_path: str = "output.wav",
         speaker: str = None,
+        split_sentences: bool = True,
     ):
         """Convert text to speech with voice conversion and save to file.

@@ -511,6 +545,12 @@ class TTS(nn.Module):
             speaker (str, optional):
                 Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                 `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
+            split_sentences (bool, optional):
+                Split text into sentences, synthesize them separately and concatenate the resulting audio.
+                Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
+                applicable to the 🐸TTS models. Defaults to True.
         """
-        wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav, speaker=speaker)
+        wav = self.tts_with_vc(
+            text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences
+        )
         save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)

diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index 35323782..3952504d 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -317,7 +317,6 @@ class ModelManager(object):
                     f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/hash.md5",
                 ],
             }
-            print(model_item)
         else:
             # get model from models.json
             model_type, lang, dataset, model = model_name.split("/")

diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 0d0eb78a..781561f9 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -264,6 +264,7 @@ class Synthesizer(nn.Module):
         style_text=None,
         reference_wav=None,
         reference_speaker_name=None,
+        split_sentences: bool = True,
         **kwargs,
     ) -> List[int]:
         """🐸 TTS magic. Run all the models and generate speech.
@@ -277,6 +278,8 @@ class Synthesizer(nn.Module):
             style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None.
             reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
             reference_speaker_name ([type], optional): speaker id of reference waveform. Defaults to None.
+            split_sentences (bool, optional): split the input text into sentences. Defaults to True.
+            **kwargs: additional arguments to pass to the TTS model.
         Returns:
             List[int]: [description]
         """
@@ -289,8 +292,10 @@ class Synthesizer(nn.Module):
             )

         if text:
-            sens = self.split_into_sentences(text)
-            print(" > Text splitted to sentences.")
+            sens = [text]
+            if split_sentences:
+                print(" > Text split into sentences.")
+                sens = self.split_into_sentences(text)
             print(sens)

         # handle multi-speaker

From 18b7d746cb5c0f294ffd229e0221adced9e461f6 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Mon, 27 Nov 2023 14:54:49 +0100
Subject: [PATCH 4/8] Updating XTTS docs

---
 docs/source/models/xtts.md | 66 +++++++++++++++++++++++++++++++++++---
 1 file changed, 62 insertions(+), 4 deletions(-)

diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md
index 43f27540..7e461a49 100644
--- a/docs/source/models/xtts.md
+++ b/docs/source/models/xtts.md
@@ -39,6 +39,10 @@ You can also mail us at info@coqui.ai.
 #### 🐸TTS API

 ##### Single reference
+
+Splits the text into sentences and generates audio for each sentence. The audio files are then concatenated to produce the final audio.
+You can optionally disable sentence splitting for better coherence, at the cost of more VRAM and possibly hitting the model's context length limit.
+
 ```python
 from TTS.api import TTS
 tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

 # generate speech by cloning a voice using default settings
 tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                 file_path="output.wav",
                 speaker_wav=["/path/to/target/speaker.wav"],
-                language="en")
+                language="en",
+                split_sentences=True
+                )
 ```

 ##### Multiple references
+
+You can pass multiple audio files to the `speaker_wav` argument for better voice cloning.
+
 ```python
 from TTS.api import TTS
+
+# using the default version set in 🐸TTS
 tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

+# using a specific version
+# 👀 see the branch names for versions on https://huggingface.co/coqui/XTTS-v2/tree/main
+# ❗some versions might be incompatible with the API
+tts = TTS("xtts_v2.0.2", gpu=True)
+
+# getting the latest XTTS_v2
+tts = TTS("xtts", gpu=True)
+
 # generate speech by cloning a voice using default settings
 tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                 file_path="output.wav",
                 speaker_wav=["/path/to/target/speaker_1.wav", "/path/to/target/speaker_2.wav"],
                 language="en")
 ```

+##### Streaming inference
+
+XTTS supports streaming inference. This is useful for real-time applications.
+
+```python
+import time
+
+import torch
+import torchaudio
+
+from TTS.api import TTS
+
+print("Loading model...")
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
+model = tts.synthesizer.tts_model
+
+print("Computing speaker latents...")
+gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
+
+print("Inference...")
+t0 = time.time()
+stream_generator = model.inference_stream(
+    "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
+    "en",
+    gpt_cond_latent,
+    speaker_embedding
+)
+
+wav_chunks = []
+for i, chunk in enumerate(stream_generator):
+    if i == 0:
+        print(f"Time to first chunk: {time.time() - t0}")
+    print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+    wav_chunks.append(chunk)
+wav = torch.cat(wav_chunks, dim=0)
+torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
+```

 #### 🐸TTS Command line

 ##### Single reference
 ```console
  tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
      --text "Bugün okula gitmek istemiyorum." \
      --speaker_wav /path/to/target/speaker.wav \
      --language_idx tr \
      --use_cuda true
 ```

 ##### Multiple references
 ```console
  tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
      --text "Bugün okula gitmek istemiyorum." \
      --speaker_wav /path/to/target/speaker.wav /path/to/target/speaker_2.wav /path/to/target/speaker_3.wav \
      --language_idx tr \
      --use_cuda true
 ```
 or for all wav files in a directory you can use:

 ```console
  tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
      --text "Bugün okula gitmek istemiyorum." \
      --speaker_wav /path/to/target/*.wav \
      --language_idx tr \
      --use_cuda true
 ```

+#### 🐸TTS Model API

-#### model directly
+To use the model API, you need to download the model files and pass the config and model file paths manually.

-If you want to be able to run with `use_deepspeed=True` and enjoy the speedup, you need to install deepspeed first.
+##### Calling manually
+
+If you want to be able to run with `use_deepspeed=True` and **enjoy the speedup**, you need to install deepspeed first.

 ```console
 pip install deepspeed==0.10.3
 ```

 ```python
 import torch
 import torchaudio
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts

 print("Loading model...")
 config = XttsConfig()
 config.load_json("/path/to/xtts/config.json")
 model = Xtts.init_from_config(config)
 model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=True)
 model.cuda()

 print("Computing speaker latents...")
 gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])

 print("Inference...")
 out = model.inference(
     "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
     "en",
     gpt_cond_latent,
     speaker_embedding,
     temperature=0.7,  # Add custom parameters here
 )
 torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
 ```

-#### streaming inference
+##### Streaming manually

 Here the goal is to stream the audio as it is being generated. This is useful for real-time applications.
 Streaming inference is typically slower than regular inference, but it allows you to get the first chunk of audio faster.

From bfbaffc84a48e362f3039d4128c201a081e6a0ff Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Tue, 28 Nov 2023 13:47:45 +0100
Subject: [PATCH 5/8] Fixup

---
 TTS/api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/api.py b/TTS/api.py
index 0ae515fa..b3aa531b 100644
--- a/TTS/api.py
+++ b/TTS/api.py
@@ -76,7 +76,7 @@ class TTS(nn.Module):
         if gpu:
             warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")

-        if model_name is not None:
+        if model_name is not None and len(model_name) > 0:
             if "tts_models" in model_name or "coqui_studio" in model_name:
                 self.load_tts_model_by_name(model_name, gpu)

From 39321d02befe17ad49194c0d42f8020d8fb8a856 Mon Sep 17 00:00:00 2001
From: Enno Hermann
Date: Thu, 30 Nov 2023 13:03:16 +0100
Subject: [PATCH 6/8] fix: correctly strip/restore initial punctuation (#3336)

* refactor(punctuation): remove orphan code for handling lone punctuation

The case of lone punctuation is already handled at the top of restore().
The removed if statement would never be called and would in fact raise an
AttributeError because the _punc_index named tuple doesn't have the
attribute `mark`.

* refactor(punctuation): remove unused argument

* fix(punctuation): correctly handle initial punctuation

Stripping and restoring initial punctuation didn't work correctly because
the string-splitting caused an additional empty string to be inserted in
the text list (because `".A".split(".")` => `["", "A"]`). Now, an initial
empty string is skipped and relevant test cases are added.
Fixes #3333
---
 TTS/tts/utils/text/punctuation.py    | 23 +++++++++++------------
 tests/text_tests/test_punctuation.py |  5 +++++
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/TTS/tts/utils/text/punctuation.py b/TTS/tts/utils/text/punctuation.py
index 8d199cc5..36c467d0 100644
--- a/TTS/tts/utils/text/punctuation.py
+++ b/TTS/tts/utils/text/punctuation.py
@@ -15,7 +15,6 @@ class PuncPosition(Enum):
     BEGIN = 0
     END = 1
     MIDDLE = 2
-    ALONE = 3


 class Punctuation:
@@ -92,7 +91,7 @@ class Punctuation:
             return [text], []
         # the text is only punctuations
         if len(matches) == 1 and matches[0].group() == text:
-            return [], [_PUNC_IDX(text, PuncPosition.ALONE)]
+            return [], [_PUNC_IDX(text, PuncPosition.BEGIN)]
         # build a punctuation map to be used later to restore punctuations
         puncs = []
         for match in matches:
@@ -107,11 +106,14 @@ class Punctuation:
         for idx, punc in enumerate(puncs):
             split = text.split(punc.punc)
             prefix, suffix = split[0], punc.punc.join(split[1:])
+            text = suffix
+            if prefix == "":
+                # We don't want to insert an empty string in case of initial punctuation
+                continue
             splitted_text.append(prefix)
             # if the text does not end with a punctuation, add it to the last item
             if idx == len(puncs) - 1 and len(suffix) > 0:
                 splitted_text.append(suffix)
-            text = suffix
         return splitted_text, puncs

     @classmethod
@@ -127,10 +129,10 @@ class Punctuation:
             ['This is', 'example'], ['.', '!'] -> "This is. example!"
         """
-        return cls._restore(text, puncs, 0)
+        return cls._restore(text, puncs)

     @classmethod
-    def _restore(cls, text, puncs, num):  # pylint: disable=too-many-return-statements
+    def _restore(cls, text, puncs):  # pylint: disable=too-many-return-statements
         """Auxiliary method for Punctuation.restore()"""
         if not puncs:
             return text
@@ -142,21 +144,18 @@ class Punctuation:
         current = puncs[0]

         if current.position == PuncPosition.BEGIN:
-            return cls._restore([current.punc + text[0]] + text[1:], puncs[1:], num)
+            return cls._restore([current.punc + text[0]] + text[1:], puncs[1:])

         if current.position == PuncPosition.END:
-            return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1)
-
-        if current.position == PuncPosition.ALONE:
-            return [current.mark] + cls._restore(text, puncs[1:], num + 1)
+            return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:])

         # POSITION == MIDDLE
         if len(text) == 1:  # pragma: nocover
             # a corner case where the final part of an intermediate
             # mark (I) has not been phonemized
-            return cls._restore([text[0] + current.punc], puncs[1:], num)
+            return cls._restore([text[0] + current.punc], puncs[1:])

-        return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num)
+        return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:])


 # if __name__ == "__main__":
diff --git a/tests/text_tests/test_punctuation.py b/tests/text_tests/test_punctuation.py
index 141c10e4..bb7b11ed 100644
--- a/tests/text_tests/test_punctuation.py
+++ b/tests/text_tests/test_punctuation.py
@@ -11,6 +11,11 @@ class PunctuationTest(unittest.TestCase):
             ("This, is my text ... to be striped !! from text", "This is my text to be striped from text"),
             ("This, is my text ... to be striped from text?", "This is my text to be striped from text"),
             ("This, is my text to be striped from text", "This is my text to be striped from text"),
+            (".", ""),
+            (" . ", ""),
+            ("!!! Attention !!!", "Attention"),
+            ("!!! Attention !!! This is just a ... test.", "Attention This is just a test"),
+            ("!!! Attention! This is just a ... test.", "Attention This is just a test"),
         ]

     def test_get_set_puncs(self):

From e40527b103465fc24cb9274b63ad90975c412513 Mon Sep 17 00:00:00 2001
From: Hannes Krumbiegel
Date: Thu, 30 Nov 2023 13:03:33 +0100
Subject: [PATCH 7/8] Fix link to installation instructions (#3329)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4e5855f9..ef16c9b6 100644
--- a/README.md
+++ b/README.md
@@ -72,7 +72,7 @@ Please use our dedicated channels for questions and discussion. Help is much mor
 | Type                            | Links                                    |
 | ------------------------------- | --------------------------------------- |
 | 💼 **Documentation**            | [ReadTheDocs](https://tts.readthedocs.io/en/latest/)
-| 💾 **Installation**             | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#install-tts)|
+| 💾 **Installation**             | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#installation)|
 | 👩‍💻 **Contributing**            | [CONTRIBUTING.md](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md)|
 | 📌 **Road Map**                 | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378)
 | 🚀 **Released Models**          | [TTS Releases](https://github.com/coqui-ai/TTS/releases) and [Experimental Models](https://github.com/coqui-ai/TTS/wiki/Experimental-Released-Models)|

From 6d1905c2b73a80401cdd5e9824e4700e3181a3b3 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Thu, 30 Nov 2023 13:05:10 +0100
Subject: [PATCH 8/8] Update to v0.21.2

---
 TTS/VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/VERSION b/TTS/VERSION
index a67cebaf..59dad104 100644
--- a/TTS/VERSION
+++ b/TTS/VERSION
@@ -1 +1 @@
-0.21.1
+0.21.2
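Taken together, the series gives the 🐸TTS API two new user-facing behaviors: loading XTTS by a bare version name (PATCH 1/8) and opting out of sentence splitting (PATCH 3/8). A minimal usage sketch with the series applied on top of v0.21.1; the file path and reference speaker wav are illustrative placeholders, and downloading XTTS still requires agreeing to the CPML license terms:

```python
from TTS.api import TTS

# A bare name matching v<major>.<minor>.<patch> downloads that version of
# XTTS-v2 through the Coqui gateway; a bare "xtts" resolves to "main".
tts = TTS("xtts_v2.0.2", gpu=True)

# split_sentences=False synthesizes the whole text in a single pass;
# this uses more VRAM and may hit the model's context length limit.
tts.tts_to_file(
    text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
    file_path="output.wav",
    speaker_wav=["/path/to/target/speaker.wav"],
    language="en",
    split_sentences=False,
)
```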