Rename "pipeline" to "cleaners"

No need to introduce new terminology.
pull/2/head
Keith Ito 2017-09-04 21:54:23 -07:00
parent 9e1ea7a879
commit c4e14ad3b9
4 changed files with 18 additions and 18 deletions

View File

@ -50,21 +50,21 @@ following the example of the other preprocessors in that file.
### Non-English Data ### Non-English Data
If your training data is in a language other than English, you will probably want to change the If your training data is in a language other than English, you will probably want to change the
text cleaning pipeline by setting the `cleaners` hyperparameter. text cleaners by setting the `cleaners` hyperparameter.
* If your text is in a Latin script or can be transliterated to ASCII using the * If your text is in a Latin script or can be transliterated to ASCII using the
[Unidecode](https://pypi.python.org/pypi/Unidecode) library, you can use the transliteration [Unidecode](https://pypi.python.org/pypi/Unidecode) library, you can use the transliteration
pipeline by setting the hyperparameter `cleaners=transliteration_pipeline`. cleaners by setting the hyperparameter `cleaners=transliteration_cleaners`.
* If you don't want to transliterate, you can define a custom character set. * If you don't want to transliterate, you can define a custom character set.
This allows you to train directly on the character set used in your data. This allows you to train directly on the character set used in your data.
To do so, edit [symbols.py](text/symbols.py) and change the `_characters` variable to be a To do so, edit [symbols.py](text/symbols.py) and change the `_characters` variable to be a
string containing the UTF-8 characters in your data. Then set the hyperparameter `cleaners=basic_pipeline`. string containing the UTF-8 characters in your data. Then set the hyperparameter `cleaners=basic_cleaners`.
* If you're not sure which option to use, you can evaluate the transliteration pipeline like so: * If you're not sure which option to use, you can evaluate the transliteration cleaners like this:
```python ```python
from text import cleaners from text import cleaners
cleaners.transliteration_pipeline('Здравствуйте') # Replace with the text you want to try cleaners.transliteration_cleaners('Здравствуйте') # Replace with the text you want to try
``` ```

View File

@ -4,8 +4,8 @@ import tensorflow as tf
# Default hyperparameters: # Default hyperparameters:
hparams = tf.contrib.training.HParams( hparams = tf.contrib.training.HParams(
# Comma-separated list of cleaners to run on text prior to training and eval. For non-English # Comma-separated list of cleaners to run on text prior to training and eval. For non-English
# text, you may want to use "basic_pipeline" or "transliteration_pipeline". See TRAINING_DATA.md. # text, you may want to use "basic_cleaners" or "transliteration_cleaners". See TRAINING_DATA.md.
cleaners='english_pipeline', cleaners='english_cleaners',
# Audio: # Audio:
num_mels=80, num_mels=80,

View File

@ -14,7 +14,7 @@ def test_text_to_sequence():
assert text_to_sequence('"A"_B', []) == [2, 3, 1] assert text_to_sequence('"A"_B', []) == [2, 3, 1]
assert text_to_sequence('A {AW1 S} B', []) == [2, 64, 83, 132, 64, 3, 1] assert text_to_sequence('A {AW1 S} B', []) == [2, 64, 83, 132, 64, 3, 1]
assert text_to_sequence('Hi', ['lowercase']) == [35, 36, 1] assert text_to_sequence('Hi', ['lowercase']) == [35, 36, 1]
assert text_to_sequence('A {AW1 S} B', ['english_pipeline']) == [28, 64, 83, 132, 64, 29, 1] assert text_to_sequence('A {AW1 S} B', ['english_cleaners']) == [28, 64, 83, 132, 64, 29, 1]
def test_sequence_to_text(): def test_sequence_to_text():
@ -52,9 +52,9 @@ def test_expand_numbers():
assert cleaners.expand_numbers('$3.50 for gas.') == 'three dollars, fifty cents for gas.' assert cleaners.expand_numbers('$3.50 for gas.') == 'three dollars, fifty cents for gas.'
def test_pipelines(): def test_cleaner_pipelines():
text = 'Mr. Müller ate 2 Apples' text = 'Mr. Müller ate 2 Apples'
assert cleaners.english_pipeline(text) == 'mister muller ate two apples' assert cleaners.english_cleaners(text) == 'mister muller ate two apples'
assert cleaners.transliteration_pipeline(text) == 'mr. muller ate 2 apples' assert cleaners.transliteration_cleaners(text) == 'mr. muller ate 2 apples'
assert cleaners.basic_pipeline(text) == 'mr. müller ate 2 apples' assert cleaners.basic_cleaners(text) == 'mr. müller ate 2 apples'

View File

@ -3,10 +3,10 @@ Cleaners are transformations that run over the input text at both training and e
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use: hyperparameter. Some cleaners are English-specific. You'll typically want to use:
1. "english_pipeline" for English text 1. "english_cleaners" for English text
2. "transliteration_pipeline" for non-English text that can be transliterated to ASCII using 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode) the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_pipeline" if you do not want to transliterate (in this case, you should also update 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
the symbols in symbols.py to match your data). the symbols in symbols.py to match your data).
''' '''
@ -63,14 +63,14 @@ def convert_to_ascii(text):
return unidecode(text) return unidecode(text)
def basic_pipeline(text): def basic_cleaners(text):
'''Basic pipeline that lowercases and collapses whitespace without transliteration.''' '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
text = lowercase(text) text = lowercase(text)
text = collapse_whitespace(text) text = collapse_whitespace(text)
return text return text
def transliteration_pipeline(text): def transliteration_cleaners(text):
'''Pipeline for non-English text that transliterates to ASCII.''' '''Pipeline for non-English text that transliterates to ASCII.'''
text = convert_to_ascii(text) text = convert_to_ascii(text)
text = lowercase(text) text = lowercase(text)
@ -78,7 +78,7 @@ def transliteration_pipeline(text):
return text return text
def english_pipeline(text): def english_cleaners(text):
'''Pipeline for English text, including number and abbreviation expansion.''' '''Pipeline for English text, including number and abbreviation expansion.'''
text = convert_to_ascii(text) text = convert_to_ascii(text)
text = lowercase(text) text = lowercase(text)