diff --git a/TRAINING_DATA.md b/TRAINING_DATA.md index a96b63c..f1ac3cb 100644 --- a/TRAINING_DATA.md +++ b/TRAINING_DATA.md @@ -50,21 +50,21 @@ following the example of the other preprocessors in that file. ### Non-English Data If your training data is in a language other than English, you will probably want to change the -text cleaning pipeline by setting the `cleaners` hyperparameter. +text cleaners by setting the `cleaners` hyperparameter. * If your text is in a Latin script or can be transliterated to ASCII using the [Unidecode](https://pypi.python.org/pypi/Unidecode) library, you can use the transliteration - pipeline by setting the hyperparameter `cleaners=transliteration_pipeline`. + cleaners by setting the hyperparameter `cleaners=transliteration_cleaners`. * If you don't want to transliterate, you can define a custom character set. This allows you to train directly on the character set used in your data. To do so, edit [symbols.py](text/symbols.py) and change the `_characters` variable to be a - string containing the UTF-8 characters in your data. Then set the hyperparameter `cleaners=basic_pipeline`. + string containing the UTF-8 characters in your data. Then set the hyperparameter `cleaners=basic_cleaners`. - * If you're not sure which option to use, you can evaluate the transliteration pipeline like so: + * If you're not sure which option to use, you can evaluate the transliteration cleaners like this: ```python from text import cleaners - cleaners.transliteration_pipeline('Здравствуйте') # Replace with the text you want to try + cleaners.transliteration_cleaners('Здравствуйте') # Replace with the text you want to try ``` diff --git a/hparams.py b/hparams.py index 23e6796..352c655 100644 --- a/hparams.py +++ b/hparams.py @@ -4,8 +4,8 @@ import tensorflow as tf # Default hyperparameters: hparams = tf.contrib.training.HParams( # Comma-separated list of cleaners to run on text prior to training and eval. 
For non-English - # text, you may want to use "basic_pipeline" or "transliteration_pipeline" See TRAINING_DATA.md. - cleaners='english_pipeline', + # text, you may want to use "basic_cleaners" or "transliteration_cleaners". See TRAINING_DATA.md. + cleaners='english_cleaners', # Audio: num_mels=80, diff --git a/tests/text_test.py b/tests/text_test.py index a222ed6..242a44c 100644 --- a/tests/text_test.py +++ b/tests/text_test.py @@ -14,7 +14,7 @@ def test_text_to_sequence(): assert text_to_sequence('"A"_B', []) == [2, 3, 1] assert text_to_sequence('A {AW1 S} B', []) == [2, 64, 83, 132, 64, 3, 1] assert text_to_sequence('Hi', ['lowercase']) == [35, 36, 1] - assert text_to_sequence('A {AW1 S} B', ['english_pipeline']) == [28, 64, 83, 132, 64, 29, 1] + assert text_to_sequence('A {AW1 S} B', ['english_cleaners']) == [28, 64, 83, 132, 64, 29, 1] def test_sequence_to_text(): @@ -52,9 +52,9 @@ def test_expand_numbers(): assert cleaners.expand_numbers('$3.50 for gas.') == 'three dollars, fifty cents for gas.' -def test_pipelines(): +def test_cleaner_pipelines(): text = 'Mr. Müller ate 2 Apples' - assert cleaners.english_pipeline(text) == 'mister muller ate two apples' - assert cleaners.transliteration_pipeline(text) == 'mr. muller ate 2 apples' - assert cleaners.basic_pipeline(text) == 'mr. müller ate 2 apples' + assert cleaners.english_cleaners(text) == 'mister muller ate two apples' + assert cleaners.transliteration_cleaners(text) == 'mr. muller ate 2 apples' + assert cleaners.basic_cleaners(text) == 'mr. müller ate 2 apples' diff --git a/text/cleaners.py b/text/cleaners.py index 5eedca0..aa56c4c 100644 --- a/text/cleaners.py +++ b/text/cleaners.py @@ -3,10 +3,10 @@ Cleaners are transformations that run over the input text at both training and e Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" hyperparameter. Some cleaners are English-specific. You'll typically want to use: - 1. "english_pipeline" for English text - 2. 
"transliteration_pipeline" for non-English text that can be transliterated to ASCII using + 1. "english_cleaners" for English text + 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using the Unidecode library (https://pypi.python.org/pypi/Unidecode) - 3. "basic_pipeline" if you do not want to transliterate (in this case, you should also update + 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update the symbols in symbols.py to match your data). ''' @@ -63,14 +63,14 @@ def convert_to_ascii(text): return unidecode(text) -def basic_pipeline(text): +def basic_cleaners(text): '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' text = lowercase(text) text = collapse_whitespace(text) return text -def transliteration_pipeline(text): +def transliteration_cleaners(text): '''Pipeline for non-English text that transliterates to ASCII.''' text = convert_to_ascii(text) text = lowercase(text) @@ -78,7 +78,7 @@ def transliteration_pipeline(text): return text -def english_pipeline(text): +def english_cleaners(text): '''Pipeline for English text, including number and abbreviation expansion.''' text = convert_to_ascii(text) text = lowercase(text)