diff --git a/LICENSE b/LICENSE index 4ad4ed1..b09cd78 100644 --- a/LICENSE +++ b/LICENSE @@ -1,19 +1,201 @@ -Copyright (c) 2017 Keith Ito +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. + 1. Definitions. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 18aebe3..f23951a 100644 --- a/README.md +++ b/README.md @@ -1,168 +1,6 @@ -# Tacotron +# mimic2 -An implementation of Tacotron speech synthesis in TensorFlow. +This is a fork of [keithito/tacotron](https://github.com/keithito/tacotron) +with changes specific to Mimic 2 applied. - -### Audio Samples - - * **[Audio Samples](https://keithito.github.io/audio-samples/)** from models trained using this repo. - * The first set was trained for 877K steps on the [LJ Speech Dataset](https://keithito.com/LJ-Speech-Dataset/) - * Speech started to become intelligble around 20K steps. - * Although loss continued to decrease, there wasn't much noticable improvement after ~250K steps. - * The second set was trained by [@MXGray](https://github.com/MXGray) for 140K steps on the [Nancy Corpus](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/). 
- - - -## Background - -Earlier this year, Google published a paper, [Tacotron: A Fully End-to-End Text-To-Speech Synthesis Model](https://arxiv.org/pdf/1703.10135.pdf), -where they present a neural text-to-speech model that learns to synthesize speech directly from -(text, audio) pairs. However, they didn't release their source code or training data. This is an -attempt to provide an open-source implementation of the model described in their paper. - -The quality isn't as good as Google's demo yet, but hopefully it will get there someday :-). -Pull requests are welcome! - - - -## Quick Start - -### Installing dependencies - -1. Install Python 3. - -2. Install the latest version of [TensorFlow](https://www.tensorflow.org/install/) for your platform. For better - performance, install with GPU support if it's available. This code works with TensorFlow 1.3 or 1.4. - -3. Install requirements: - ``` - pip install -r requirements.txt - ``` - - -### Using a pre-trained model - -1. **Download and unpack a model**: - ``` - curl http://data.keithito.com/data/speech/tacotron-20170720.tar.bz2 | tar xjC /tmp - ``` - -2. **Run the demo server**: - ``` - python3 demo_server.py --checkpoint /tmp/tacotron-20170720/model.ckpt - ``` - -3. **Point your browser at localhost:9000** - * Type what you want to synthesize - - - -### Training - -*Note: you need at least 40GB of free disk space to train a model.* - -1. **Download a speech dataset.** - - The following are supported out of the box: - * [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) (Public Domain) - * [Blizzard 2012](http://www.cstr.ed.ac.uk/projects/blizzard/2012/phase_one) (Creative Commons Attribution Share-Alike) - - You can use other datasets if you convert them to the right format. See [TRAINING_DATA.md](TRAINING_DATA.md) for more info. - - -2. **Unpack the dataset into `~/tacotron`** - - After unpacking, your tree should look like this for LJ Speech: - ``` - tacotron - |- LJSpeech-1.0 - |- metadata.csv - |- wavs - ``` - - or like this for Blizzard 2012: - ``` - tacotron - |- Blizzard2012 - |- ATrampAbroad - | |- sentence_index.txt - | |- lab - | |- wav - |- TheManThatCorruptedHadleyburg - |- sentence_index.txt - |- lab - |- wav - ``` - -3. **Preprocess the data** - ``` - python3 preprocess.py --dataset ljspeech - ``` - * Use `--dataset blizzard` for Blizzard data - -4. **Train a model** - ``` - python3 train.py - ``` - - Tunable hyperparameters are found in [hparams.py](hparams.py). You can adjust these at the command - line using the `--hparams` flag, for example `--hparams="batch_size=16,outputs_per_step=2"`. - Hyperparameters should generally be set to the same values at both training and eval time. - - -5. **Monitor with Tensorboard** (optional) - ``` - tensorboard --logdir ~/tacotron/logs-tacotron - ``` - - The trainer dumps audio and alignments every 1000 steps. You can find these in - `~/tacotron/logs-tacotron`. - -6. **Synthesize from a checkpoint** - ``` - python3 demo_server.py --checkpoint ~/tacotron/logs-tacotron/model.ckpt-185000 - ``` - Replace "185000" with the checkpoint number that you want to use, then open a browser - to `localhost:9000` and type what you want to speak. Alternately, you can - run [eval.py](eval.py) at the command line: - ``` - python3 eval.py --checkpoint ~/tacotron/logs-tacotron/model.ckpt-185000 - ``` - If you set the `--hparams` flag when training, set the same value here. 
- - -## Notes and Common Issues - - * [TCMalloc](http://goog-perftools.sourceforge.net/doc/tcmalloc.html) seems to improve - training speed and avoids occasional slowdowns seen with the default allocator. You - can enable it by installing it and setting `LD_PRELOAD=/usr/lib/libtcmalloc.so`. - - * You can train with [CMUDict](http://www.speech.cs.cmu.edu/cgi-bin/cmudict) by downloading the - dictionary to ~/tacotron/training and then passing the flag `--hparams="use_cmudict=True"` to - train.py. This will allow you to pass ARPAbet phonemes enclosed in curly braces at eval - time to force a particular pronunciation, e.g. `Turn left on {HH AW1 S S T AH0 N} Street.` - - * If you pass a Slack incoming webhook URL as the `--slack_url` flag to train.py, it will send - you progress updates every 1000 steps. - - * Occasionally, you may see a spike in loss and the model will forget how to attend (the - alignments will no longer make sense). Although it will recover eventually, it may - save time to restart at a checkpoint prior to the spike by passing the - `--restore_step=150000` flag to train.py (replacing 150000 with a step number prior to the - spike). **Update**: a recent [fix](https://github.com/keithito/tacotron/pull/7) to gradient - clipping by @candlewill may have fixed this. - - * During eval and training, audio length is limited to `max_iters * outputs_per_step * frame_shift_ms` - milliseconds. With the defaults (max_iters=200, outputs_per_step=5, frame_shift_ms=12.5), this is - 12.5 seconds. - - If your training examples are longer, you will see an error like this: - `Incompatible shapes: [32,1340,80] vs. [32,1000,80]` - - To fix this, you can set a larger value of `max_iters` by passing `--hparams="max_iters=300"` to - train.py (replace "300" with a value based on how long your audio is and the formula above). - - -## Other Implementations - * By Alex Barron: https://github.com/barronalex/Tacotron - * By Kyubyong Park: https://github.com/Kyubyong/tacotron +Copyright (c) 2017 Keith Ito diff --git a/requirements.txt b/requirements.txt index 67a7a00..6fd141d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,8 @@ # Note: this doesn't include tensorflow or tensorflow-gpu because the package you need to install # depends on your platform. It is assumed you have already installed tensorflow. 
falcon==1.2.0 -inflect==0.2.5 librosa==0.5.1 matplotlib==2.0.2 numpy==1.13.0 scipy==0.19.0 tqdm==4.11.2 -Unidecode==0.4.20 diff --git a/tests/numbers_test.py b/tests/numbers_test.py index 7fa6b60..a45b4ac 100644 --- a/tests/numbers_test.py +++ b/tests/numbers_test.py @@ -2,40 +2,39 @@ from text.numbers import normalize_numbers def test_normalize_numbers(): + assert normalize_numbers('0') == 'zero' assert normalize_numbers('1') == 'one' assert normalize_numbers('15') == 'fifteen' - assert normalize_numbers('24') == 'twenty-four' + assert normalize_numbers('24') == 'twenty four' assert normalize_numbers('100') == 'one hundred' assert normalize_numbers('101') == 'one hundred one' - assert normalize_numbers('456') == 'four hundred fifty-six' + assert normalize_numbers('456') == 'four hundred fifty six' assert normalize_numbers('1000') == 'one thousand' assert normalize_numbers('1800') == 'eighteen hundred' assert normalize_numbers('2,000') == 'two thousand' assert normalize_numbers('3000') == 'three thousand' assert normalize_numbers('18000') == 'eighteen thousand' - assert normalize_numbers('24,000') == 'twenty-four thousand' - assert normalize_numbers('124,001') == 'one hundred twenty-four thousand one' + assert normalize_numbers('24,000') == 'twenty four thousand' + assert normalize_numbers('124,001') == 'one hundred twenty four thousand one' + assert normalize_numbers('999,999') == 'nine hundred ninety nine thousand nine hundred ninety nine' + assert normalize_numbers('1000000002') == 'one billion two' + assert normalize_numbers('1200000000') == 'one billion two hundred million' + assert normalize_numbers('19800000004001') == 'nineteen trillion eight hundred billion four thousand one' + assert normalize_numbers('712000000000000000') == 'seven hundred twelve quadrillion' + assert normalize_numbers('1000000000000000000') == '1000000000000000000' assert normalize_numbers('6.4 sec') == 'six point four sec' def test_normalize_ordinals(): assert normalize_numbers('1st') == 'first' assert normalize_numbers('2nd') == 'second' + assert normalize_numbers('5th') == 'fifth' assert normalize_numbers('9th') == 'ninth' - assert normalize_numbers('243rd place') == 'two hundred and forty-third place' - - -def test_normalize_dates(): - assert normalize_numbers('1400') == 'fourteen hundred' - assert normalize_numbers('1901') == 'nineteen oh one' - assert normalize_numbers('1999') == 'nineteen ninety-nine' - assert normalize_numbers('2000') == 'two thousand' - assert normalize_numbers('2004') == 'two thousand four' - assert normalize_numbers('2010') == 'twenty ten' - assert normalize_numbers('2012') == 'twenty twelve' - assert normalize_numbers('2025') == 'twenty twenty-five' - assert normalize_numbers('September 11, 2001') == 'September eleven, two thousand one' - assert normalize_numbers('July 26, 1984.') == 'July twenty-six, nineteen eighty-four.' 
+ assert normalize_numbers('15th') == 'fifteenth' + assert normalize_numbers('212th street') == 'two hundred twelfth street' + assert normalize_numbers('243rd place') == 'two hundred forty third place' + assert normalize_numbers('1025th') == 'one thousand twenty fifth' + assert normalize_numbers('1000000th') == 'one millionth' def test_normalize_money(): @@ -43,9 +42,9 @@ def test_normalize_money(): assert normalize_numbers('$1') == 'one dollar' assert normalize_numbers('$10') == 'ten dollars' assert normalize_numbers('$.01') == 'one cent' - assert normalize_numbers('$0.25') == 'twenty-five cents' + assert normalize_numbers('$0.25') == 'twenty five cents' assert normalize_numbers('$5.00') == 'five dollars' assert normalize_numbers('$5.01') == 'five dollars, one cent' - assert normalize_numbers('$135.99.') == 'one hundred thirty-five dollars, ninety-nine cents.' + assert normalize_numbers('$135.99.') == 'one hundred thirty five dollars, ninety nine cents.' assert normalize_numbers('$40,000') == 'forty thousand dollars' - assert normalize_numbers('for £2500!') == 'for twenty-five hundred pounds!' + assert normalize_numbers('for £2500!') == 'for twenty five hundred pounds!' diff --git a/tests/text_test.py b/tests/text_test.py index 242a44c..9ce63b4 100644 --- a/tests/text_test.py +++ b/tests/text_test.py @@ -31,13 +31,6 @@ def test_collapse_whitespace(): assert cleaners.collapse_whitespace(' x. y, \tz') == ' x. y, z' -def test_convert_to_ascii(): - assert cleaners.convert_to_ascii("raison d'être") == "raison d'etre" - assert cleaners.convert_to_ascii('grüß gott') == 'gruss gott' - assert cleaners.convert_to_ascii('안녕') == 'annyeong' - assert cleaners.convert_to_ascii('Здравствуйте') == 'Zdravstvuite' - - def test_lowercase(): assert cleaners.lowercase('Happy Birthday!') == 'happy birthday!' assert cleaners.lowercase('CAFÉ') == 'café' @@ -48,13 +41,13 @@ def test_expand_abbreviations(): def test_expand_numbers(): - assert cleaners.expand_numbers('3 apples and 44 pears') == 'three apples and forty-four pears' + assert cleaners.expand_numbers('3 apples and 44 pears') == 'three apples and forty four pears' assert cleaners.expand_numbers('$3.50 for gas.') == 'three dollars, fifty cents for gas.' def test_cleaner_pipelines(): text = 'Mr. Müller ate 2 Apples' - assert cleaners.english_cleaners(text) == 'mister muller ate two apples' - assert cleaners.transliteration_cleaners(text) == 'mr. muller ate 2 apples' + assert cleaners.english_cleaners(text) == 'mister mller ate two apples' + assert cleaners.transliteration_cleaners(text) == 'mr. mller ate 2 apples' assert cleaners.basic_cleaners(text) == 'mr. müller ate 2 apples' diff --git a/text/cleaners.py b/text/cleaners.py index aa56c4c..77b1434 100644 --- a/text/cleaners.py +++ b/text/cleaners.py @@ -4,14 +4,11 @@ Cleaners are transformations that run over the input text at both training and e Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" hyperparameter. Some cleaners are English-specific. You'll typically want to use: 1. "english_cleaners" for English text - 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using - the Unidecode library (https://pypi.python.org/pypi/Unidecode) - 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update + 2. "basic_cleaners" if you do not want to transliterate (in this case, you should also update the symbols in symbols.py to match your data). 
''' import re -from unidecode import unidecode from .numbers import normalize_numbers @@ -60,7 +57,7 @@ def collapse_whitespace(text): def convert_to_ascii(text): - return unidecode(text) + return re.sub(r'[^\x00-\x7F]+', '', text) # This simply strips non-ASCII characters. def basic_cleaners(text): diff --git a/text/numbers.py b/text/numbers.py index ba9eb74..8d49cd0 100644 --- a/text/numbers.py +++ b/text/numbers.py @@ -1,15 +1,68 @@ -import inflect import re -_inflect = inflect.engine() _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') -_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_ordinal_re = re.compile(r'([0-9]+)(st|nd|rd|th)') _number_re = re.compile(r'[0-9]+') +_units = [ + '', + 'one', + 'two', + 'three', + 'four', + 'five', + 'six', + 'seven', + 'eight', + 'nine', + 'ten', + 'eleven', + 'twelve', + 'thirteen', + 'fourteen', + 'fifteen', + 'sixteen', + 'seventeen', + 'eighteen', + 'nineteen' +] + +_tens = [ + '', + 'ten', + 'twenty', + 'thirty', + 'forty', + 'fifty', + 'sixty', + 'seventy', + 'eighty', + 'ninety', +] + +_digit_groups = [ + '', + 'thousand', + 'million', + 'billion', + 'trillion', + 'quadrillion', +] + +_ordinal_suffixes = [ + ('one', 'first'), + ('two', 'second'), + ('three', 'third'), + ('five', 'fifth'), + ('eight', 'eighth'), + ('nine', 'ninth'), + ('twelve', 'twelfth'), + ('ty', 'tieth'), +] def _remove_commas(m): return m.group(1).replace(',', '') @@ -40,23 +93,47 @@ def _expand_dollars(m): return 'zero dollars' -def _expand_ordinal(m): - return _inflect.number_to_words(m.group(0)) +def _standard_number_to_words(n, digit_group): + parts = [] + if n >= 1000: + # Format next higher digit group. + parts.append(_standard_number_to_words(n // 1000, digit_group + 1)) + n = n % 1000 + + if n >= 100: + parts.append('%s hundred' % _units[n // 100]) + if n % 100 >= len(_units): + parts.append(_tens[(n % 100) // 10]) + parts.append(_units[(n % 100) % 10]) + else: + parts.append(_units[n % 100]) + if n > 0: + parts.append(_digit_groups[digit_group]) + return ' '.join([x for x in parts if x]) + + +def _number_to_words(n): + # Handle special cases first, then go to the standard case: + if n >= 1000000000000000000: + return str(n) # Too large, just return the digits + elif n == 0: + return 'zero' + elif n % 100 == 0 and n % 1000 != 0 and n < 3000: + return _standard_number_to_words(n // 100, 0) + ' hundred' + else: + return _standard_number_to_words(n, 0) def _expand_number(m): - num = int(m.group(0)) - if num > 1000 and num < 3000: - if num == 2000: - return 'two thousand' - elif num > 2000 and num < 2010: - return 'two thousand ' + _inflect.number_to_words(num % 100) - elif num % 100 == 0: - return _inflect.number_to_words(num // 100) + ' hundred' - else: - return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') - else: - return _inflect.number_to_words(num, andword='') + return _number_to_words(int(m.group(0))) + + +def _expand_ordinal(m): + num = _number_to_words(int(m.group(1))) + for suffix, replacement in _ordinal_suffixes: + if num.endswith(suffix): + return num[:-len(suffix)] + replacement + return num + 'th' def normalize_numbers(text):
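For reference, here is a minimal sketch (not part of the patch) of how the inflect-free pipeline behaves after these changes. It assumes the package layout is unchanged, so `text.numbers` and `text.cleaners` import as before; the expected values are copied from the updated tests in this diff.

```python
# Quick sanity check of the hand-rolled number normalization and the new
# ASCII handling introduced by this patch (expected values come from the tests).
from text.numbers import normalize_numbers
from text.cleaners import convert_to_ascii

# Number words are now joined with spaces instead of hyphens, and ordinals
# are derived from the spelled-out cardinal via suffix replacement.
assert normalize_numbers('124,001') == 'one hundred twenty four thousand one'
assert normalize_numbers('1025th') == 'one thousand twenty fifth'
assert normalize_numbers('$5.01') == 'five dollars, one cent'

# With Unidecode removed, convert_to_ascii strips non-ASCII characters instead
# of transliterating them, e.g. 'Müller' -> 'Mller' (previously 'Muller').
assert convert_to_ascii('Müller') == 'Mller'
```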