Rename "pipeline" to "cleaners"

No need to introduce new terminology.
pull/2/head
Keith Ito 2017-09-04 21:54:23 -07:00
parent 9e1ea7a879
commit c4e14ad3b9
4 changed files with 18 additions and 18 deletions


@@ -50,21 +50,21 @@ following the example of the other preprocessors in that file.
### Non-English Data
If your training data is in a language other than English, you will probably want to change the
-text cleaning pipeline by setting the `cleaners` hyperparameter.
+text cleaners by setting the `cleaners` hyperparameter.
* If your text is in a Latin script or can be transliterated to ASCII using the
[Unidecode](https://pypi.python.org/pypi/Unidecode) library, you can use the transliteration
-pipeline by setting the hyperparameter `cleaners=transliteration_pipeline`.
+cleaners by setting the hyperparameter `cleaners=transliteration_cleaners`.
* If you don't want to transliterate, you can define a custom character set.
This allows you to train directly on the character set used in your data.
To do so, edit [symbols.py](text/symbols.py) and change the `_characters` variable to be a
-string containing the UTF-8 characters in your data. Then set the hyperparameter `cleaners=basic_pipeline`.
+string containing the UTF-8 characters in your data. Then set the hyperparameter `cleaners=basic_cleaners`.
-* If you're not sure which option to use, you can evaluate the transliteration pipeline like so:
+* If you're not sure which option to use, you can evaluate the transliteration cleaners like this:
```python
from text import cleaners
-cleaners.transliteration_pipeline('Здравствуйте') # Replace with the text you want to try
+cleaners.transliteration_cleaners('Здравствуйте') # Replace with the text you want to try
```
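
For a quick side-by-side check of the two non-English options, a small sketch along these lines (assuming it is run from the repository root so that the `text` package is importable) shows what each cleaner produces for a sample sentence:

```python
from text import cleaners

sample = 'Здравствуйте'  # replace with a sentence from your own data
print(cleaners.transliteration_cleaners(sample))  # transliterated to ASCII via Unidecode
print(cleaners.basic_cleaners(sample))            # original characters kept; lowercased, whitespace collapsed
```

If the transliterated output still reads sensibly, `transliteration_cleaners` is usually the simpler choice; otherwise use `basic_cleaners` together with a custom `_characters` set in symbols.py.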


@@ -4,8 +4,8 @@ import tensorflow as tf
# Default hyperparameters:
hparams = tf.contrib.training.HParams(
  # Comma-separated list of cleaners to run on text prior to training and eval. For non-English
-  # text, you may want to use "basic_pipeline" or "transliteration_pipeline" See TRAINING_DATA.md.
-  cleaners='english_pipeline',
+  # text, you may want to use "basic_cleaners" or "transliteration_cleaners" See TRAINING_DATA.md.
+  cleaners='english_cleaners',
  # Audio:
  num_mels=80,
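
Since the `cleaners` value is documented as a comma-separated list, it can also be overridden at run time rather than by editing the file. A minimal sketch using the `tf.contrib.training.HParams` API (TensorFlow 1.x; the defaults below are abbreviated):

```python
import tensorflow as tf

hparams = tf.contrib.training.HParams(
  # Abbreviated defaults; see hparams.py for the full set.
  cleaners='english_cleaners',
  num_mels=80,
)

# HParams.parse accepts a comma-separated list of name=value pairs,
# so a non-English run can switch cleaners without touching the file:
hparams.parse('cleaners=transliteration_cleaners')
print(hparams.cleaners)  # -> transliteration_cleaners
```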


@@ -14,7 +14,7 @@ def test_text_to_sequence():
  assert text_to_sequence('"A"_B', []) == [2, 3, 1]
  assert text_to_sequence('A {AW1 S} B', []) == [2, 64, 83, 132, 64, 3, 1]
  assert text_to_sequence('Hi', ['lowercase']) == [35, 36, 1]
-  assert text_to_sequence('A {AW1 S} B', ['english_pipeline']) == [28, 64, 83, 132, 64, 29, 1]
+  assert text_to_sequence('A {AW1 S} B', ['english_cleaners']) == [28, 64, 83, 132, 64, 29, 1]
def test_sequence_to_text():
@@ -52,9 +52,9 @@ def test_expand_numbers():
  assert cleaners.expand_numbers('$3.50 for gas.') == 'three dollars, fifty cents for gas.'
-def test_pipelines():
+def test_cleaner_pipelines():
  text = 'Mr. Müller ate 2 Apples'
-  assert cleaners.english_pipeline(text) == 'mister muller ate two apples'
-  assert cleaners.transliteration_pipeline(text) == 'mr. muller ate 2 apples'
-  assert cleaners.basic_pipeline(text) == 'mr. müller ate 2 apples'
+  assert cleaners.english_cleaners(text) == 'mister muller ate two apples'
+  assert cleaners.transliteration_cleaners(text) == 'mr. muller ate 2 apples'
+  assert cleaners.basic_cleaners(text) == 'mr. müller ate 2 apples'


@@ -3,10 +3,10 @@ Cleaners are transformations that run over the input text at both training and e
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
1. "english_pipeline" for English text
2. "transliteration_pipeline" for non-English text that can be transliterated to ASCII using
1. "english_cleaners" for English text
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_pipeline" if you do not want to transliterate (in this case, you should also update
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
the symbols in symbols.py to match your data).
'''
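
The docstring above leaves the actual lookup to the caller, so purely as an illustration (not the repository's own dispatch code), a comma-delimited `cleaners` setting could be resolved against this module roughly like this:

```python
from text import cleaners

def apply_cleaners(text, cleaner_setting):
  # Hypothetical helper: split the comma-delimited setting, look each name
  # up on the cleaners module, and apply the matching functions in order.
  for name in cleaner_setting.split(','):
    cleaner = getattr(cleaners, name, None)
    if cleaner is None:
      raise ValueError('Unknown cleaner: %s' % name)
    text = cleaner(text)
  return text

print(apply_cleaners('Mr. Müller ate 2 Apples', 'transliteration_cleaners'))
# -> mr. muller ate 2 apples (per the updated test above)
```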
@@ -63,14 +63,14 @@ def convert_to_ascii(text):
  return unidecode(text)
-def basic_pipeline(text):
+def basic_cleaners(text):
  '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
  text = lowercase(text)
  text = collapse_whitespace(text)
  return text
-def transliteration_pipeline(text):
+def transliteration_cleaners(text):
  '''Pipeline for non-English text that transliterates to ASCII.'''
  text = convert_to_ascii(text)
  text = lowercase(text)
@@ -78,7 +78,7 @@ def transliteration_pipeline(text):
  return text
-def english_pipeline(text):
+def english_cleaners(text):
  '''Pipeline for English text, including number and abbreviation expansion.'''
  text = convert_to_ascii(text)
  text = lowercase(text)