mirror of https://github.com/MycroftAI/mimic2.git
parent 9e1ea7a879 · commit c4e14ad3b9
@@ -50,21 +50,21 @@ following the example of the other preprocessors in that file.
 ### Non-English Data
 
 If your training data is in a language other than English, you will probably want to change the
-text cleaning pipeline by setting the `cleaners` hyperparameter.
+text cleaners by setting the `cleaners` hyperparameter.
 
 * If your text is in a Latin script or can be transliterated to ASCII using the
   [Unidecode](https://pypi.python.org/pypi/Unidecode) library, you can use the transliteration
-  pipeline by setting the hyperparameter `cleaners=transliteration_pipeline`.
+  cleaners by setting the hyperparameter `cleaners=transliteration_cleaners`.
 
 * If you don't want to transliterate, you can define a custom character set.
   This allows you to train directly on the character set used in your data.
 
   To do so, edit [symbols.py](text/symbols.py) and change the `_characters` variable to be a
-  string containing the UTF-8 characters in your data. Then set the hyperparameter `cleaners=basic_pipeline`.
+  string containing the UTF-8 characters in your data. Then set the hyperparameter `cleaners=basic_cleaners`.
 
-* If you're not sure which option to use, you can evaluate the transliteration pipeline like so:
+* If you're not sure which option to use, you can evaluate the transliteration cleaners like this:
 
   ```python
   from text import cleaners
-  cleaners.transliteration_pipeline('Здравствуйте')   # Replace with the text you want to try
+  cleaners.transliteration_cleaners('Здравствуйте')   # Replace with the text you want to try
   ```
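To decide between the two non-English options, it can help to run both cleaners on a line of your own data and compare the output; a minimal sketch (the sample string is only a placeholder):

```python
from text import cleaners

sample = 'Здравствуйте'  # replace with a line from your own training data
print(cleaners.transliteration_cleaners(sample))  # ASCII transliteration via Unidecode, lowercased
print(cleaners.basic_cleaners(sample))            # original characters kept, lowercased only
```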
@@ -4,8 +4,8 @@ import tensorflow as tf
 # Default hyperparameters:
 hparams = tf.contrib.training.HParams(
   # Comma-separated list of cleaners to run on text prior to training and eval. For non-English
-  # text, you may want to use "basic_pipeline" or "transliteration_pipeline" See TRAINING_DATA.md.
-  cleaners='english_pipeline',
+  # text, you may want to use "basic_cleaners" or "transliteration_cleaners" See TRAINING_DATA.md.
+  cleaners='english_cleaners',
 
   # Audio:
   num_mels=80,
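For a non-English run, the default above would typically be overridden rather than edited in place; a hedged sketch using the standard `HParams.parse` API from `tf.contrib.training` (how the repository's training script actually exposes overrides is not shown in this diff):

```python
import tensorflow as tf

hparams = tf.contrib.training.HParams(
    cleaners='english_cleaners',  # default, as in hparams.py
    num_mels=80,
)

# Override via a comma-separated name=value string, e.g. passed in from a command-line flag:
hparams.parse('cleaners=transliteration_cleaners')
print(hparams.cleaners)  # -> transliteration_cleaners
```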
@@ -14,7 +14,7 @@ def test_text_to_sequence():
   assert text_to_sequence('"A"_B', []) == [2, 3, 1]
   assert text_to_sequence('A {AW1 S} B', []) == [2, 64, 83, 132, 64, 3, 1]
   assert text_to_sequence('Hi', ['lowercase']) == [35, 36, 1]
-  assert text_to_sequence('A {AW1 S} B', ['english_pipeline']) == [28, 64, 83, 132, 64, 29, 1]
+  assert text_to_sequence('A {AW1 S} B', ['english_cleaners']) == [28, 64, 83, 132, 64, 29, 1]
 
 
 def test_sequence_to_text():
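As the updated assertion shows, cleaner names are passed to `text_to_sequence` as a list; a small usage sketch (the `from text import ...` path is an assumption about the package layout, and no symbol IDs are asserted since they depend on the configured symbol set):

```python
from text import sequence_to_text, text_to_sequence  # assumed package layout

# Curly braces mark ARPAbet phoneme input; 'english_cleaners' names a function in text/cleaners.py.
seq = text_to_sequence('A {AW1 S} B', ['english_cleaners'])
print(seq)                    # list of integer symbol IDs, ending with the end-of-sequence token
print(sequence_to_text(seq))  # approximate round trip back to a readable string
```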
@@ -52,9 +52,9 @@ def test_expand_numbers():
   assert cleaners.expand_numbers('$3.50 for gas.') == 'three dollars, fifty cents for gas.'
 
 
-def test_pipelines():
+def test_cleaner_pipelines():
   text = 'Mr. Müller ate 2 Apples'
-  assert cleaners.english_pipeline(text) == 'mister muller ate two apples'
-  assert cleaners.transliteration_pipeline(text) == 'mr. muller ate 2 apples'
-  assert cleaners.basic_pipeline(text) == 'mr. müller ate 2 apples'
+  assert cleaners.english_cleaners(text) == 'mister muller ate two apples'
+  assert cleaners.transliteration_cleaners(text) == 'mr. muller ate 2 apples'
+  assert cleaners.basic_cleaners(text) == 'mr. müller ate 2 apples'
@@ -3,10 +3,10 @@ Cleaners are transformations that run over the input text at both training and eval time.
 
 Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
 hyperparameter. Some cleaners are English-specific. You'll typically want to use:
-  1. "english_pipeline" for English text
-  2. "transliteration_pipeline" for non-English text that can be transliterated to ASCII using
+  1. "english_cleaners" for English text
+  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
      the Unidecode library (https://pypi.python.org/pypi/Unidecode)
-  3. "basic_pipeline" if you do not want to transliterate (in this case, you should also update
+  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
      the symbols in symbols.py to match your data).
 '''
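Because the hyperparameter is a comma-delimited list of names, individual transformations from this module can also be combined without defining a new pipeline function; a sketch based on the `['lowercase']` usage in the tests (treating `collapse_whitespace` as selectable in the same way is an assumption):

```python
# e.g. in hparams.py: cleaners='lowercase,collapse_whitespace'
from text import text_to_sequence  # assumed package layout

seq = text_to_sequence('Hello   World', ['lowercase', 'collapse_whitespace'])
```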
@@ -63,14 +63,14 @@ def convert_to_ascii(text):
   return unidecode(text)
 
 
-def basic_pipeline(text):
+def basic_cleaners(text):
   '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
   text = lowercase(text)
   text = collapse_whitespace(text)
   return text
 
 
-def transliteration_pipeline(text):
+def transliteration_cleaners(text):
   '''Pipeline for non-English text that transliterates to ASCII.'''
   text = convert_to_ascii(text)
   text = lowercase(text)
@@ -78,7 +78,7 @@ def transliteration_pipeline(text):
   return text
 
 
-def english_pipeline(text):
+def english_cleaners(text):
   '''Pipeline for English text, including number and abbreviation expansion.'''
   text = convert_to_ascii(text)
   text = lowercase(text)
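A project-specific pipeline can follow the same shape as the functions above, reusing helpers already defined in this module; a hypothetical sketch (`ascii_numbers_cleaners` is not part of the repository):

```python
def ascii_numbers_cleaners(text):
  '''Hypothetical pipeline: transliterate, lowercase, and expand numbers,
  but skip the English abbreviation expansion.'''
  text = convert_to_ascii(text)
  text = lowercase(text)
  text = expand_numbers(text)
  text = collapse_whitespace(text)
  return text
```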