Remove dependency on inflect

pull/1/head
Keith Ito 2018-02-05 16:34:12 -08:00
parent 92b4379165
commit e35859d0b0
4 changed files with 117 additions and 49 deletions

View File

@ -1,7 +1,6 @@
# Note: this doesn't include tensorflow or tensorflow-gpu because the package you need to install
# depends on your platform. It is assumed you have already installed tensorflow.
falcon==1.2.0
inflect==0.2.5
librosa==0.5.1
matplotlib==2.0.2
numpy==1.13.0

View File

@ -2,40 +2,39 @@ from text.numbers import normalize_numbers
def test_normalize_numbers():
assert normalize_numbers('0') == 'zero'
assert normalize_numbers('1') == 'one'
assert normalize_numbers('15') == 'fifteen'
assert normalize_numbers('24') == 'twenty-four'
assert normalize_numbers('24') == 'twenty four'
assert normalize_numbers('100') == 'one hundred'
assert normalize_numbers('101') == 'one hundred one'
assert normalize_numbers('456') == 'four hundred fifty-six'
assert normalize_numbers('456') == 'four hundred fifty six'
assert normalize_numbers('1000') == 'one thousand'
assert normalize_numbers('1800') == 'eighteen hundred'
assert normalize_numbers('2,000') == 'two thousand'
assert normalize_numbers('3000') == 'three thousand'
assert normalize_numbers('18000') == 'eighteen thousand'
assert normalize_numbers('24,000') == 'twenty-four thousand'
assert normalize_numbers('124,001') == 'one hundred twenty-four thousand one'
assert normalize_numbers('24,000') == 'twenty four thousand'
assert normalize_numbers('124,001') == 'one hundred twenty four thousand one'
assert normalize_numbers('999,999') == 'nine hundred ninety nine thousand nine hundred ninety nine'
assert normalize_numbers('1000000002') == 'one billion two'
assert normalize_numbers('1200000000') == 'one billion two hundred million'
assert normalize_numbers('19800000004001') == 'nineteen trillion eight hundred billion four thousand one'
assert normalize_numbers('712000000000000000') == 'seven hundred twelve quadrillion'
assert normalize_numbers('1000000000000000000') == '1000000000000000000'
assert normalize_numbers('6.4 sec') == 'six point four sec'
def test_normalize_ordinals():
assert normalize_numbers('1st') == 'first'
assert normalize_numbers('2nd') == 'second'
assert normalize_numbers('5th') == 'fifth'
assert normalize_numbers('9th') == 'ninth'
assert normalize_numbers('243rd place') == 'two hundred and forty-third place'
def test_normalize_dates():
assert normalize_numbers('1400') == 'fourteen hundred'
assert normalize_numbers('1901') == 'nineteen oh one'
assert normalize_numbers('1999') == 'nineteen ninety-nine'
assert normalize_numbers('2000') == 'two thousand'
assert normalize_numbers('2004') == 'two thousand four'
assert normalize_numbers('2010') == 'twenty ten'
assert normalize_numbers('2012') == 'twenty twelve'
assert normalize_numbers('2025') == 'twenty twenty-five'
assert normalize_numbers('September 11, 2001') == 'September eleven, two thousand one'
assert normalize_numbers('July 26, 1984.') == 'July twenty-six, nineteen eighty-four.'
assert normalize_numbers('15th') == 'fifteenth'
assert normalize_numbers('212th street') == 'two hundred twelfth street'
assert normalize_numbers('243rd place') == 'two hundred forty third place'
assert normalize_numbers('1025th') == 'one thousand twenty fifth'
assert normalize_numbers('1000000th') == 'one millionth'
def test_normalize_money():
@ -43,9 +42,9 @@ def test_normalize_money():
assert normalize_numbers('$1') == 'one dollar'
assert normalize_numbers('$10') == 'ten dollars'
assert normalize_numbers('$.01') == 'one cent'
assert normalize_numbers('$0.25') == 'twenty-five cents'
assert normalize_numbers('$0.25') == 'twenty five cents'
assert normalize_numbers('$5.00') == 'five dollars'
assert normalize_numbers('$5.01') == 'five dollars, one cent'
assert normalize_numbers('$135.99.') == 'one hundred thirty-five dollars, ninety-nine cents.'
assert normalize_numbers('$135.99.') == 'one hundred thirty five dollars, ninety nine cents.'
assert normalize_numbers('$40,000') == 'forty thousand dollars'
assert normalize_numbers('for £2500!') == 'for twenty-five hundred pounds!'
assert normalize_numbers('for £2500!') == 'for twenty five hundred pounds!'

View File

@ -31,13 +31,6 @@ def test_collapse_whitespace():
assert cleaners.collapse_whitespace(' x. y, \tz') == ' x. y, z'
def test_convert_to_ascii():
assert cleaners.convert_to_ascii("raison d'être") == "raison d'etre"
assert cleaners.convert_to_ascii('grüß gott') == 'gruss gott'
assert cleaners.convert_to_ascii('안녕') == 'annyeong'
assert cleaners.convert_to_ascii('Здравствуйте') == 'Zdravstvuite'
def test_lowercase():
assert cleaners.lowercase('Happy Birthday!') == 'happy birthday!'
assert cleaners.lowercase('CAFÉ') == 'café'
@ -48,13 +41,13 @@ def test_expand_abbreviations():
def test_expand_numbers():
assert cleaners.expand_numbers('3 apples and 44 pears') == 'three apples and forty-four pears'
assert cleaners.expand_numbers('3 apples and 44 pears') == 'three apples and forty four pears'
assert cleaners.expand_numbers('$3.50 for gas.') == 'three dollars, fifty cents for gas.'
def test_cleaner_pipelines():
text = 'Mr. Müller ate 2 Apples'
assert cleaners.english_cleaners(text) == 'mister muller ate two apples'
assert cleaners.transliteration_cleaners(text) == 'mr. muller ate 2 apples'
assert cleaners.english_cleaners(text) == 'mister mller ate two apples'
assert cleaners.transliteration_cleaners(text) == 'mr. mller ate 2 apples'
assert cleaners.basic_cleaners(text) == 'mr. müller ate 2 apples'

View File

@ -1,15 +1,68 @@
import inflect
import re
_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_ordinal_re = re.compile(r'([0-9]+)(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+')
# Words for the values 0-19, indexed by the value itself. Index 0 is the empty
# string so that a zero contributes nothing: _standard_number_to_words drops
# empty parts before joining.
_units = [
'',
'one',
'two',
'three',
'four',
'five',
'six',
'seven',
'eight',
'nine',
'ten',
'eleven',
'twelve',
'thirteen',
'fourteen',
'fifteen',
'sixteen',
'seventeen',
'eighteen',
'nineteen'
]
# Words for the multiples of ten, indexed by the tens digit (2 -> 'twenty').
# _standard_number_to_words only consults this table when the two-digit
# remainder is not covered by _units (i.e. it is 20 or more).
_tens = [
'',
'ten',
'twenty',
'thirty',
'forty',
'fifty',
'sixty',
'seventy',
'eighty',
'ninety',
]
# Names of successive groups of three digits, indexed by the `digit_group`
# argument of _standard_number_to_words (0 = lowest three digits, unnamed).
# The table stops at 'quadrillion'; _number_to_words returns plain digits for
# values of 10**18 and above instead of indexing past the end.
_digit_groups = [
'',
'thousand',
'million',
'billion',
'trillion',
'quadrillion',
]
# (cardinal ending, ordinal ending) pairs. _expand_ordinal replaces the first
# ending in this list that the spelled-out number ends with, and otherwise
# just appends 'th'.
_ordinal_suffixes = [
('one', 'first'),
('two', 'second'),
('three', 'third'),
('five', 'fifth'),
('eight', 'eighth'),
('nine', 'ninth'),
('twelve', 'twelfth'),
('ty', 'tieth'),
]
def _remove_commas(m):
    """Return group 1 of the regex match *m* with every comma stripped out."""
    captured = m.group(1)
    # Splitting on ',' and re-joining drops all commas in one pass.
    return ''.join(captured.split(','))
@ -40,23 +93,47 @@ def _expand_dollars(m):
return 'zero dollars'
def _expand_ordinal(m):
return _inflect.number_to_words(m.group(0))
def _standard_number_to_words(n, digit_group):
    """Spell out the integer *n*, working through it three digits at a time.

    Args:
        n: The integer to spell out.
        digit_group: Index into `_digit_groups` naming the lowest three digits
            of *n* (0 for the plain number, 1 for thousands, and so on).

    Returns:
        The space-separated words for *n*. Empty pieces (a zero group, a zero
        digit) are dropped, so the result is '' when *n* is 0.
    """
    words = []
    if n >= 1000:
        # Everything above the lowest three digits belongs to the next group up.
        higher, n = divmod(n, 1000)
        words.append(_standard_number_to_words(higher, digit_group + 1))

    hundreds, remainder = divmod(n, 100)
    if n >= 100:
        words.append('%s hundred' % _units[hundreds])
    if remainder >= len(_units):
        # Not covered by the 0-19 table: spell tens and ones separately.
        tens_digit, ones_digit = divmod(remainder, 10)
        words.append(_tens[tens_digit])
        words.append(_units[ones_digit])
    else:
        words.append(_units[remainder])
    if n > 0:
        # Only name the group ("thousand", "million", ...) when it is non-zero.
        words.append(_digit_groups[digit_group])
    return ' '.join(filter(None, words))
def _number_to_words(n):
    """Convert the integer *n* to words, applying a few special cases first.

    Special cases, in order:
      * values of 10**18 or more are returned as their decimal digits;
      * 0 becomes 'zero';
      * exact hundreds below 3000 that are not exact thousands are read as
        "<count> hundred" (e.g. 1800 -> 'eighteen hundred').
    Everything else goes through `_standard_number_to_words`.
    """
    if n >= 1000000000000000000:
        # Too large to name: hand back the digits unchanged.
        return str(n)
    if n == 0:
        return 'zero'
    exact_hundreds = n % 100 == 0 and n % 1000 != 0
    if exact_hundreds and n < 3000:
        return _standard_number_to_words(n // 100, 0) + ' hundred'
    return _standard_number_to_words(n, 0)
def _expand_number(m):
num = int(m.group(0))
if num > 1000 and num < 3000:
if num == 2000:
return 'two thousand'
elif num > 2000 and num < 2010:
return 'two thousand ' + _inflect.number_to_words(num % 100)
elif num % 100 == 0:
return _inflect.number_to_words(num // 100) + ' hundred'
else:
return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
else:
return _inflect.number_to_words(num, andword='')
return _number_to_words(int(m.group(0)))
def _expand_ordinal(m):
    """Spell out an ordinal such as '243rd' as words ('two hundred forty third').

    Args:
        m: A regex match whose group 1 is the digit string of the ordinal
            (for example a match of `_ordinal_re`).

    Returns:
        The cardinal words for the number with the final word turned into its
        ordinal form. If the number is too large for `_number_to_words` to
        spell out, the matched text is returned unchanged.
    """
    num = _number_to_words(int(m.group(1)))
    if num.isdigit():
        # _number_to_words gave back raw digits (number too large). Keep the
        # original token so its suffix is preserved; appending 'th' here would
        # turn e.g. '...001st' into '...001th'.
        return m.group(0)
    for suffix, replacement in _ordinal_suffixes:
        if num.endswith(suffix):
            # Swap the irregular cardinal ending for its ordinal form.
            return num[:-len(suffix)] + replacement
    return num + 'th'
def normalize_numbers(text):