From e35859d0b07f86e878198880fde78cd9886a4b40 Mon Sep 17 00:00:00 2001 From: Keith Ito Date: Mon, 5 Feb 2018 16:34:12 -0800 Subject: [PATCH] Remove dependency on inflect --- requirements.txt | 1 - tests/numbers_test.py | 41 ++++++++-------- tests/text_test.py | 13 ++--- text/numbers.py | 111 +++++++++++++++++++++++++++++++++++------- 4 files changed, 117 insertions(+), 49 deletions(-) diff --git a/requirements.txt b/requirements.txt index e080c20..6fd141d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ # Note: this doesn't include tensorflow or tensorflow-gpu because the package you need to install # depends on your platform. It is assumed you have already installed tensorflow. falcon==1.2.0 -inflect==0.2.5 librosa==0.5.1 matplotlib==2.0.2 numpy==1.13.0 diff --git a/tests/numbers_test.py b/tests/numbers_test.py index 7fa6b60..a45b4ac 100644 --- a/tests/numbers_test.py +++ b/tests/numbers_test.py @@ -2,40 +2,39 @@ from text.numbers import normalize_numbers def test_normalize_numbers(): + assert normalize_numbers('0') == 'zero' assert normalize_numbers('1') == 'one' assert normalize_numbers('15') == 'fifteen' - assert normalize_numbers('24') == 'twenty-four' + assert normalize_numbers('24') == 'twenty four' assert normalize_numbers('100') == 'one hundred' assert normalize_numbers('101') == 'one hundred one' - assert normalize_numbers('456') == 'four hundred fifty-six' + assert normalize_numbers('456') == 'four hundred fifty six' assert normalize_numbers('1000') == 'one thousand' assert normalize_numbers('1800') == 'eighteen hundred' assert normalize_numbers('2,000') == 'two thousand' assert normalize_numbers('3000') == 'three thousand' assert normalize_numbers('18000') == 'eighteen thousand' - assert normalize_numbers('24,000') == 'twenty-four thousand' - assert normalize_numbers('124,001') == 'one hundred twenty-four thousand one' + assert normalize_numbers('24,000') == 'twenty four thousand' + assert normalize_numbers('124,001') == 'one hundred twenty four thousand one' + assert normalize_numbers('999,999') == 'nine hundred ninety nine thousand nine hundred ninety nine' + assert normalize_numbers('1000000002') == 'one billion two' + assert normalize_numbers('1200000000') == 'one billion two hundred million' + assert normalize_numbers('19800000004001') == 'nineteen trillion eight hundred billion four thousand one' + assert normalize_numbers('712000000000000000') == 'seven hundred twelve quadrillion' + assert normalize_numbers('1000000000000000000') == '1000000000000000000' assert normalize_numbers('6.4 sec') == 'six point four sec' def test_normalize_ordinals(): assert normalize_numbers('1st') == 'first' assert normalize_numbers('2nd') == 'second' + assert normalize_numbers('5th') == 'fifth' assert normalize_numbers('9th') == 'ninth' - assert normalize_numbers('243rd place') == 'two hundred and forty-third place' - - -def test_normalize_dates(): - assert normalize_numbers('1400') == 'fourteen hundred' - assert normalize_numbers('1901') == 'nineteen oh one' - assert normalize_numbers('1999') == 'nineteen ninety-nine' - assert normalize_numbers('2000') == 'two thousand' - assert normalize_numbers('2004') == 'two thousand four' - assert normalize_numbers('2010') == 'twenty ten' - assert normalize_numbers('2012') == 'twenty twelve' - assert normalize_numbers('2025') == 'twenty twenty-five' - assert normalize_numbers('September 11, 2001') == 'September eleven, two thousand one' - assert normalize_numbers('July 26, 1984.') == 'July twenty-six, nineteen eighty-four.' + assert normalize_numbers('15th') == 'fifteenth' + assert normalize_numbers('212th street') == 'two hundred twelfth street' + assert normalize_numbers('243rd place') == 'two hundred forty third place' + assert normalize_numbers('1025th') == 'one thousand twenty fifth' + assert normalize_numbers('1000000th') == 'one millionth' def test_normalize_money(): @@ -43,9 +42,9 @@ def test_normalize_money(): assert normalize_numbers('$1') == 'one dollar' assert normalize_numbers('$10') == 'ten dollars' assert normalize_numbers('$.01') == 'one cent' - assert normalize_numbers('$0.25') == 'twenty-five cents' + assert normalize_numbers('$0.25') == 'twenty five cents' assert normalize_numbers('$5.00') == 'five dollars' assert normalize_numbers('$5.01') == 'five dollars, one cent' - assert normalize_numbers('$135.99.') == 'one hundred thirty-five dollars, ninety-nine cents.' + assert normalize_numbers('$135.99.') == 'one hundred thirty five dollars, ninety nine cents.' assert normalize_numbers('$40,000') == 'forty thousand dollars' - assert normalize_numbers('for £2500!') == 'for twenty-five hundred pounds!' + assert normalize_numbers('for £2500!') == 'for twenty five hundred pounds!' diff --git a/tests/text_test.py b/tests/text_test.py index 242a44c..9ce63b4 100644 --- a/tests/text_test.py +++ b/tests/text_test.py @@ -31,13 +31,6 @@ def test_collapse_whitespace(): assert cleaners.collapse_whitespace(' x. y, \tz') == ' x. y, z' -def test_convert_to_ascii(): - assert cleaners.convert_to_ascii("raison d'être") == "raison d'etre" - assert cleaners.convert_to_ascii('grüß gott') == 'gruss gott' - assert cleaners.convert_to_ascii('안녕') == 'annyeong' - assert cleaners.convert_to_ascii('Здравствуйте') == 'Zdravstvuite' - - def test_lowercase(): assert cleaners.lowercase('Happy Birthday!') == 'happy birthday!' assert cleaners.lowercase('CAFÉ') == 'café' @@ -48,13 +41,13 @@ def test_expand_abbreviations(): def test_expand_numbers(): - assert cleaners.expand_numbers('3 apples and 44 pears') == 'three apples and forty-four pears' + assert cleaners.expand_numbers('3 apples and 44 pears') == 'three apples and forty four pears' assert cleaners.expand_numbers('$3.50 for gas.') == 'three dollars, fifty cents for gas.' def test_cleaner_pipelines(): text = 'Mr. Müller ate 2 Apples' - assert cleaners.english_cleaners(text) == 'mister muller ate two apples' - assert cleaners.transliteration_cleaners(text) == 'mr. muller ate 2 apples' + assert cleaners.english_cleaners(text) == 'mister mller ate two apples' + assert cleaners.transliteration_cleaners(text) == 'mr. mller ate 2 apples' assert cleaners.basic_cleaners(text) == 'mr. müller ate 2 apples' diff --git a/text/numbers.py b/text/numbers.py index ba9eb74..8d49cd0 100644 --- a/text/numbers.py +++ b/text/numbers.py @@ -1,15 +1,68 @@ -import inflect import re -_inflect = inflect.engine() _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') -_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_ordinal_re = re.compile(r'([0-9]+)(st|nd|rd|th)') _number_re = re.compile(r'[0-9]+') +_units = [ + '', + 'one', + 'two', + 'three', + 'four', + 'five', + 'six', + 'seven', + 'eight', + 'nine', + 'ten', + 'eleven', + 'twelve', + 'thirteen', + 'fourteen', + 'fifteen', + 'sixteen', + 'seventeen', + 'eighteen', + 'nineteen' +] + +_tens = [ + '', + 'ten', + 'twenty', + 'thirty', + 'forty', + 'fifty', + 'sixty', + 'seventy', + 'eighty', + 'ninety', +] + +_digit_groups = [ + '', + 'thousand', + 'million', + 'billion', + 'trillion', + 'quadrillion', +] + +_ordinal_suffixes = [ + ('one', 'first'), + ('two', 'second'), + ('three', 'third'), + ('five', 'fifth'), + ('eight', 'eighth'), + ('nine', 'ninth'), + ('twelve', 'twelfth'), + ('ty', 'tieth'), +] def _remove_commas(m): return m.group(1).replace(',', '') @@ -40,23 +93,47 @@ def _expand_dollars(m): return 'zero dollars' -def _expand_ordinal(m): - return _inflect.number_to_words(m.group(0)) +def _standard_number_to_words(n, digit_group): + parts = [] + if n >= 1000: + # Format next higher digit group. + parts.append(_standard_number_to_words(n // 1000, digit_group + 1)) + n = n % 1000 + + if n >= 100: + parts.append('%s hundred' % _units[n // 100]) + if n % 100 >= len(_units): + parts.append(_tens[(n % 100) // 10]) + parts.append(_units[(n % 100) % 10]) + else: + parts.append(_units[n % 100]) + if n > 0: + parts.append(_digit_groups[digit_group]) + return ' '.join([x for x in parts if x]) + + +def _number_to_words(n): + # Handle special cases first, then go to the standard case: + if n >= 1000000000000000000: + return str(n) # Too large, just return the digits + elif n == 0: + return 'zero' + elif n % 100 == 0 and n % 1000 != 0 and n < 3000: + return _standard_number_to_words(n // 100, 0) + ' hundred' + else: + return _standard_number_to_words(n, 0) def _expand_number(m): - num = int(m.group(0)) - if num > 1000 and num < 3000: - if num == 2000: - return 'two thousand' - elif num > 2000 and num < 2010: - return 'two thousand ' + _inflect.number_to_words(num % 100) - elif num % 100 == 0: - return _inflect.number_to_words(num // 100) + ' hundred' - else: - return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') - else: - return _inflect.number_to_words(num, andword='') + return _number_to_words(int(m.group(0))) + + +def _expand_ordinal(m): + num = _number_to_words(int(m.group(1))) + for suffix, replacement in _ordinal_suffixes: + if num.endswith(suffix): + return num[:-len(suffix)] + replacement + return num + 'th' def normalize_numbers(text):