mirror of https://github.com/MycroftAI/mimic2.git
Merge pull request #1 from MycroftAI/mycroft-changes
Apache license and remove non Apache-compatible codepull/2/head
commit
14960104e5
214
LICENSE
214
LICENSE
|
@ -1,19 +1,201 @@
|
|||
Copyright (c) 2017 Keith Ito
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
1. Definitions.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
|
170
README.md
170
README.md
|
@ -1,168 +1,6 @@
|
|||
# Tacotron
|
||||
# mimic2
|
||||
|
||||
An implementation of Tacotron speech synthesis in TensorFlow.
|
||||
This is a fork of [keithito/tacotron](https://github.com/keithito/tacotron)
|
||||
with changes specific to Mimic 2 applied.
|
||||
|
||||
|
||||
### Audio Samples
|
||||
|
||||
* **[Audio Samples](https://keithito.github.io/audio-samples/)** from models trained using this repo.
|
||||
* The first set was trained for 877K steps on the [LJ Speech Dataset](https://keithito.com/LJ-Speech-Dataset/)
|
||||
* Speech started to become intelligble around 20K steps.
|
||||
* Although loss continued to decrease, there wasn't much noticable improvement after ~250K steps.
|
||||
* The second set was trained by [@MXGray](https://github.com/MXGray) for 140K steps on the [Nancy Corpus](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/).
|
||||
|
||||
|
||||
|
||||
## Background
|
||||
|
||||
Earlier this year, Google published a paper, [Tacotron: A Fully End-to-End Text-To-Speech Synthesis Model](https://arxiv.org/pdf/1703.10135.pdf),
|
||||
where they present a neural text-to-speech model that learns to synthesize speech directly from
|
||||
(text, audio) pairs. However, they didn't release their source code or training data. This is an
|
||||
attempt to provide an open-source implementation of the model described in their paper.
|
||||
|
||||
The quality isn't as good as Google's demo yet, but hopefully it will get there someday :-).
|
||||
Pull requests are welcome!
|
||||
|
||||
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Installing dependencies
|
||||
|
||||
1. Install Python 3.
|
||||
|
||||
2. Install the latest version of [TensorFlow](https://www.tensorflow.org/install/) for your platform. For better
|
||||
performance, install with GPU support if it's available. This code works with TensorFlow 1.3 or 1.4.
|
||||
|
||||
3. Install requirements:
|
||||
```
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
|
||||
### Using a pre-trained model
|
||||
|
||||
1. **Download and unpack a model**:
|
||||
```
|
||||
curl http://data.keithito.com/data/speech/tacotron-20170720.tar.bz2 | tar xjC /tmp
|
||||
```
|
||||
|
||||
2. **Run the demo server**:
|
||||
```
|
||||
python3 demo_server.py --checkpoint /tmp/tacotron-20170720/model.ckpt
|
||||
```
|
||||
|
||||
3. **Point your browser at localhost:9000**
|
||||
* Type what you want to synthesize
|
||||
|
||||
|
||||
|
||||
### Training
|
||||
|
||||
*Note: you need at least 40GB of free disk space to train a model.*
|
||||
|
||||
1. **Download a speech dataset.**
|
||||
|
||||
The following are supported out of the box:
|
||||
* [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) (Public Domain)
|
||||
* [Blizzard 2012](http://www.cstr.ed.ac.uk/projects/blizzard/2012/phase_one) (Creative Commons Attribution Share-Alike)
|
||||
|
||||
You can use other datasets if you convert them to the right format. See [TRAINING_DATA.md](TRAINING_DATA.md) for more info.
|
||||
|
||||
|
||||
2. **Unpack the dataset into `~/tacotron`**
|
||||
|
||||
After unpacking, your tree should look like this for LJ Speech:
|
||||
```
|
||||
tacotron
|
||||
|- LJSpeech-1.0
|
||||
|- metadata.csv
|
||||
|- wavs
|
||||
```
|
||||
|
||||
or like this for Blizzard 2012:
|
||||
```
|
||||
tacotron
|
||||
|- Blizzard2012
|
||||
|- ATrampAbroad
|
||||
| |- sentence_index.txt
|
||||
| |- lab
|
||||
| |- wav
|
||||
|- TheManThatCorruptedHadleyburg
|
||||
|- sentence_index.txt
|
||||
|- lab
|
||||
|- wav
|
||||
```
|
||||
|
||||
3. **Preprocess the data**
|
||||
```
|
||||
python3 preprocess.py --dataset ljspeech
|
||||
```
|
||||
* Use `--dataset blizzard` for Blizzard data
|
||||
|
||||
4. **Train a model**
|
||||
```
|
||||
python3 train.py
|
||||
```
|
||||
|
||||
Tunable hyperparameters are found in [hparams.py](hparams.py). You can adjust these at the command
|
||||
line using the `--hparams` flag, for example `--hparams="batch_size=16,outputs_per_step=2"`.
|
||||
Hyperparameters should generally be set to the same values at both training and eval time.
|
||||
|
||||
|
||||
5. **Monitor with Tensorboard** (optional)
|
||||
```
|
||||
tensorboard --logdir ~/tacotron/logs-tacotron
|
||||
```
|
||||
|
||||
The trainer dumps audio and alignments every 1000 steps. You can find these in
|
||||
`~/tacotron/logs-tacotron`.
|
||||
|
||||
6. **Synthesize from a checkpoint**
|
||||
```
|
||||
python3 demo_server.py --checkpoint ~/tacotron/logs-tacotron/model.ckpt-185000
|
||||
```
|
||||
Replace "185000" with the checkpoint number that you want to use, then open a browser
|
||||
to `localhost:9000` and type what you want to speak. Alternately, you can
|
||||
run [eval.py](eval.py) at the command line:
|
||||
```
|
||||
python3 eval.py --checkpoint ~/tacotron/logs-tacotron/model.ckpt-185000
|
||||
```
|
||||
If you set the `--hparams` flag when training, set the same value here.
|
||||
|
||||
|
||||
## Notes and Common Issues
|
||||
|
||||
* [TCMalloc](http://goog-perftools.sourceforge.net/doc/tcmalloc.html) seems to improve
|
||||
training speed and avoids occasional slowdowns seen with the default allocator. You
|
||||
can enable it by installing it and setting `LD_PRELOAD=/usr/lib/libtcmalloc.so`.
|
||||
|
||||
* You can train with [CMUDict](http://www.speech.cs.cmu.edu/cgi-bin/cmudict) by downloading the
|
||||
dictionary to ~/tacotron/training and then passing the flag `--hparams="use_cmudict=True"` to
|
||||
train.py. This will allow you to pass ARPAbet phonemes enclosed in curly braces at eval
|
||||
time to force a particular pronunciation, e.g. `Turn left on {HH AW1 S S T AH0 N} Street.`
|
||||
|
||||
* If you pass a Slack incoming webhook URL as the `--slack_url` flag to train.py, it will send
|
||||
you progress updates every 1000 steps.
|
||||
|
||||
* Occasionally, you may see a spike in loss and the model will forget how to attend (the
|
||||
alignments will no longer make sense). Although it will recover eventually, it may
|
||||
save time to restart at a checkpoint prior to the spike by passing the
|
||||
`--restore_step=150000` flag to train.py (replacing 150000 with a step number prior to the
|
||||
spike). **Update**: a recent [fix](https://github.com/keithito/tacotron/pull/7) to gradient
|
||||
clipping by @candlewill may have fixed this.
|
||||
|
||||
* During eval and training, audio length is limited to `max_iters * outputs_per_step * frame_shift_ms`
|
||||
milliseconds. With the defaults (max_iters=200, outputs_per_step=5, frame_shift_ms=12.5), this is
|
||||
12.5 seconds.
|
||||
|
||||
If your training examples are longer, you will see an error like this:
|
||||
`Incompatible shapes: [32,1340,80] vs. [32,1000,80]`
|
||||
|
||||
To fix this, you can set a larger value of `max_iters` by passing `--hparams="max_iters=300"` to
|
||||
train.py (replace "300" with a value based on how long your audio is and the formula above).
|
||||
|
||||
|
||||
## Other Implementations
|
||||
* By Alex Barron: https://github.com/barronalex/Tacotron
|
||||
* By Kyubyong Park: https://github.com/Kyubyong/tacotron
|
||||
Copyright (c) 2017 Keith Ito
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
# Note: this doesn't include tensorflow or tensorflow-gpu because the package you need to install
|
||||
# depends on your platform. It is assumed you have already installed tensorflow.
|
||||
falcon==1.2.0
|
||||
inflect==0.2.5
|
||||
librosa==0.5.1
|
||||
matplotlib==2.0.2
|
||||
numpy==1.13.0
|
||||
scipy==0.19.0
|
||||
tqdm==4.11.2
|
||||
Unidecode==0.4.20
|
||||
|
|
|
@ -2,40 +2,39 @@ from text.numbers import normalize_numbers
|
|||
|
||||
|
||||
def test_normalize_numbers():
|
||||
assert normalize_numbers('0') == 'zero'
|
||||
assert normalize_numbers('1') == 'one'
|
||||
assert normalize_numbers('15') == 'fifteen'
|
||||
assert normalize_numbers('24') == 'twenty-four'
|
||||
assert normalize_numbers('24') == 'twenty four'
|
||||
assert normalize_numbers('100') == 'one hundred'
|
||||
assert normalize_numbers('101') == 'one hundred one'
|
||||
assert normalize_numbers('456') == 'four hundred fifty-six'
|
||||
assert normalize_numbers('456') == 'four hundred fifty six'
|
||||
assert normalize_numbers('1000') == 'one thousand'
|
||||
assert normalize_numbers('1800') == 'eighteen hundred'
|
||||
assert normalize_numbers('2,000') == 'two thousand'
|
||||
assert normalize_numbers('3000') == 'three thousand'
|
||||
assert normalize_numbers('18000') == 'eighteen thousand'
|
||||
assert normalize_numbers('24,000') == 'twenty-four thousand'
|
||||
assert normalize_numbers('124,001') == 'one hundred twenty-four thousand one'
|
||||
assert normalize_numbers('24,000') == 'twenty four thousand'
|
||||
assert normalize_numbers('124,001') == 'one hundred twenty four thousand one'
|
||||
assert normalize_numbers('999,999') == 'nine hundred ninety nine thousand nine hundred ninety nine'
|
||||
assert normalize_numbers('1000000002') == 'one billion two'
|
||||
assert normalize_numbers('1200000000') == 'one billion two hundred million'
|
||||
assert normalize_numbers('19800000004001') == 'nineteen trillion eight hundred billion four thousand one'
|
||||
assert normalize_numbers('712000000000000000') == 'seven hundred twelve quadrillion'
|
||||
assert normalize_numbers('1000000000000000000') == '1000000000000000000'
|
||||
assert normalize_numbers('6.4 sec') == 'six point four sec'
|
||||
|
||||
|
||||
def test_normalize_ordinals():
|
||||
assert normalize_numbers('1st') == 'first'
|
||||
assert normalize_numbers('2nd') == 'second'
|
||||
assert normalize_numbers('5th') == 'fifth'
|
||||
assert normalize_numbers('9th') == 'ninth'
|
||||
assert normalize_numbers('243rd place') == 'two hundred and forty-third place'
|
||||
|
||||
|
||||
def test_normalize_dates():
|
||||
assert normalize_numbers('1400') == 'fourteen hundred'
|
||||
assert normalize_numbers('1901') == 'nineteen oh one'
|
||||
assert normalize_numbers('1999') == 'nineteen ninety-nine'
|
||||
assert normalize_numbers('2000') == 'two thousand'
|
||||
assert normalize_numbers('2004') == 'two thousand four'
|
||||
assert normalize_numbers('2010') == 'twenty ten'
|
||||
assert normalize_numbers('2012') == 'twenty twelve'
|
||||
assert normalize_numbers('2025') == 'twenty twenty-five'
|
||||
assert normalize_numbers('September 11, 2001') == 'September eleven, two thousand one'
|
||||
assert normalize_numbers('July 26, 1984.') == 'July twenty-six, nineteen eighty-four.'
|
||||
assert normalize_numbers('15th') == 'fifteenth'
|
||||
assert normalize_numbers('212th street') == 'two hundred twelfth street'
|
||||
assert normalize_numbers('243rd place') == 'two hundred forty third place'
|
||||
assert normalize_numbers('1025th') == 'one thousand twenty fifth'
|
||||
assert normalize_numbers('1000000th') == 'one millionth'
|
||||
|
||||
|
||||
def test_normalize_money():
|
||||
|
@ -43,9 +42,9 @@ def test_normalize_money():
|
|||
assert normalize_numbers('$1') == 'one dollar'
|
||||
assert normalize_numbers('$10') == 'ten dollars'
|
||||
assert normalize_numbers('$.01') == 'one cent'
|
||||
assert normalize_numbers('$0.25') == 'twenty-five cents'
|
||||
assert normalize_numbers('$0.25') == 'twenty five cents'
|
||||
assert normalize_numbers('$5.00') == 'five dollars'
|
||||
assert normalize_numbers('$5.01') == 'five dollars, one cent'
|
||||
assert normalize_numbers('$135.99.') == 'one hundred thirty-five dollars, ninety-nine cents.'
|
||||
assert normalize_numbers('$135.99.') == 'one hundred thirty five dollars, ninety nine cents.'
|
||||
assert normalize_numbers('$40,000') == 'forty thousand dollars'
|
||||
assert normalize_numbers('for £2500!') == 'for twenty-five hundred pounds!'
|
||||
assert normalize_numbers('for £2500!') == 'for twenty five hundred pounds!'
|
||||
|
|
|
@ -31,13 +31,6 @@ def test_collapse_whitespace():
|
|||
assert cleaners.collapse_whitespace(' x. y, \tz') == ' x. y, z'
|
||||
|
||||
|
||||
def test_convert_to_ascii():
|
||||
assert cleaners.convert_to_ascii("raison d'être") == "raison d'etre"
|
||||
assert cleaners.convert_to_ascii('grüß gott') == 'gruss gott'
|
||||
assert cleaners.convert_to_ascii('안녕') == 'annyeong'
|
||||
assert cleaners.convert_to_ascii('Здравствуйте') == 'Zdravstvuite'
|
||||
|
||||
|
||||
def test_lowercase():
|
||||
assert cleaners.lowercase('Happy Birthday!') == 'happy birthday!'
|
||||
assert cleaners.lowercase('CAFÉ') == 'café'
|
||||
|
@ -48,13 +41,13 @@ def test_expand_abbreviations():
|
|||
|
||||
|
||||
def test_expand_numbers():
|
||||
assert cleaners.expand_numbers('3 apples and 44 pears') == 'three apples and forty-four pears'
|
||||
assert cleaners.expand_numbers('3 apples and 44 pears') == 'three apples and forty four pears'
|
||||
assert cleaners.expand_numbers('$3.50 for gas.') == 'three dollars, fifty cents for gas.'
|
||||
|
||||
|
||||
def test_cleaner_pipelines():
|
||||
text = 'Mr. Müller ate 2 Apples'
|
||||
assert cleaners.english_cleaners(text) == 'mister muller ate two apples'
|
||||
assert cleaners.transliteration_cleaners(text) == 'mr. muller ate 2 apples'
|
||||
assert cleaners.english_cleaners(text) == 'mister mller ate two apples'
|
||||
assert cleaners.transliteration_cleaners(text) == 'mr. mller ate 2 apples'
|
||||
assert cleaners.basic_cleaners(text) == 'mr. müller ate 2 apples'
|
||||
|
||||
|
|
|
@ -4,14 +4,11 @@ Cleaners are transformations that run over the input text at both training and e
|
|||
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
|
||||
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
|
||||
1. "english_cleaners" for English text
|
||||
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
|
||||
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
|
||||
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
|
||||
2. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
|
||||
the symbols in symbols.py to match your data).
|
||||
'''
|
||||
|
||||
import re
|
||||
from unidecode import unidecode
|
||||
from .numbers import normalize_numbers
|
||||
|
||||
|
||||
|
@ -60,7 +57,7 @@ def collapse_whitespace(text):
|
|||
|
||||
|
||||
def convert_to_ascii(text):
    """Remove all non-ASCII characters from ``text``.

    NOTE(review): unlike unidecode-based transliteration, characters such
    as 'ü' are dropped outright (not mapped to ASCII lookalikes), so
    'Müller' becomes 'Mller'.
    """
    stripped = re.sub(r'[^\x00-\x7F]+', '', text)
    return stripped
|
||||
|
||||
|
||||
def basic_cleaners(text):
|
||||
|
|
111
text/numbers.py
111
text/numbers.py
|
@ -1,15 +1,68 @@
|
|||
import inflect
|
||||
import re
|
||||
|
||||
|
||||
_inflect = inflect.engine()
|
||||
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
|
||||
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
|
||||
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
|
||||
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
|
||||
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
|
||||
_ordinal_re = re.compile(r'([0-9]+)(st|nd|rd|th)')
|
||||
_number_re = re.compile(r'[0-9]+')
|
||||
|
||||
_units = [
|
||||
'',
|
||||
'one',
|
||||
'two',
|
||||
'three',
|
||||
'four',
|
||||
'five',
|
||||
'six',
|
||||
'seven',
|
||||
'eight',
|
||||
'nine',
|
||||
'ten',
|
||||
'eleven',
|
||||
'twelve',
|
||||
'thirteen',
|
||||
'fourteen',
|
||||
'fifteen',
|
||||
'sixteen',
|
||||
'seventeen',
|
||||
'eighteen',
|
||||
'nineteen'
|
||||
]
|
||||
|
||||
_tens = [
|
||||
'',
|
||||
'ten',
|
||||
'twenty',
|
||||
'thirty',
|
||||
'forty',
|
||||
'fifty',
|
||||
'sixty',
|
||||
'seventy',
|
||||
'eighty',
|
||||
'ninety',
|
||||
]
|
||||
|
||||
_digit_groups = [
|
||||
'',
|
||||
'thousand',
|
||||
'million',
|
||||
'billion',
|
||||
'trillion',
|
||||
'quadrillion',
|
||||
]
|
||||
|
||||
_ordinal_suffixes = [
|
||||
('one', 'first'),
|
||||
('two', 'second'),
|
||||
('three', 'third'),
|
||||
('five', 'fifth'),
|
||||
('eight', 'eighth'),
|
||||
('nine', 'ninth'),
|
||||
('twelve', 'twelfth'),
|
||||
('ty', 'tieth'),
|
||||
]
|
||||
|
||||
def _remove_commas(m):
|
||||
return m.group(1).replace(',', '')
|
||||
|
@ -40,23 +93,47 @@ def _expand_dollars(m):
|
|||
return 'zero dollars'
|
||||
|
||||
|
||||
def _expand_ordinal(m):
|
||||
return _inflect.number_to_words(m.group(0))
|
||||
def _standard_number_to_words(n, digit_group):
    """Spell out ``n`` in English words.

    ``digit_group`` selects the scale word appended to this group of three
    digits (0 -> '', 1 -> 'thousand', 2 -> 'million', ...); recursion walks
    up the scales for the higher-order groups.
    """
    words = []
    if n >= 1000:
        # Spell the higher-order groups first, keep the remainder below 1000.
        higher, n = divmod(n, 1000)
        words.append(_standard_number_to_words(higher, digit_group + 1))
    if n >= 100:
        words.append('%s hundred' % _units[n // 100])
    tail = n % 100
    if tail >= len(_units):
        # 20..99: a tens word plus an optional units word.
        words.append(_tens[tail // 10])
        words.append(_units[tail % 10])
    else:
        # 0..19 come straight from the units table ('' for 0).
        words.append(_units[tail])
    if n > 0:
        words.append(_digit_groups[digit_group])
    # Empty entries ('' table slots) are filtered out before joining.
    return ' '.join(w for w in words if w)
|
||||
|
||||
|
||||
def _number_to_words(n):
    """Convert a non-negative integer to English words, with special cases."""
    if n >= 10 ** 18:
        # Beyond quadrillions we have no group names; return the raw digits.
        return str(n)
    if n == 0:
        return 'zero'
    # Even hundreds below 3000 (but not even thousands) read like years:
    # 1800 -> 'eighteen hundred', while 2000 still reads 'two thousand'.
    if n < 3000 and n % 100 == 0 and n % 1000 != 0:
        return _standard_number_to_words(n // 100, 0) + ' hundred'
    return _standard_number_to_words(n, 0)
|
||||
|
||||
|
||||
def _expand_number(m):
    """re.sub callback: replace a run of digits with its English words."""
    value = int(m.group(0))
    return _number_to_words(value)
|
||||
|
||||
|
||||
def _expand_ordinal(m):
    """re.sub callback: spell an ordinal, e.g. '15th' -> 'fifteenth'."""
    words = _number_to_words(int(m.group(1)))
    # Rewrite the cardinal ending into its ordinal form where one applies
    # ('twelve' -> 'twelfth', 'twenty' -> 'twentieth', ...).
    for ending, ordinal_ending in _ordinal_suffixes:
        if words.endswith(ending):
            return words[:-len(ending)] + ordinal_ending
    # Regular cardinals just take 'th' ('four' -> 'fourth').
    return words + 'th'
|
||||
|
||||
|
||||
def normalize_numbers(text):
|
||||
|
|
Loading…
Reference in New Issue