feature/allow to pronounce ordinals and very small fractions (#1663)

* allow to pronounce ordinals

* cleanup

* long scale / short scale very small fractions
pull/1680/head
JarbasAI 2018-07-10 08:54:04 +01:00 committed by Steve Penrod
parent 1093383443
commit 6cf2ed814c
3 changed files with 161 additions and 35 deletions

View File

@ -120,8 +120,94 @@ SHORT_SCALE_EN = {
10e100: "googol"
}
# English ordinal words keyed by the number they denote, using the
# short scale (1e9 -> "billionth", 1e12 -> "trillionth", ...).
# extractnumber_en() inverts this table when ordinals=True, and
# isFractional_en() uses every entry greater than 2 as a fraction
# denominator ("fifth" -> 1/5) when short_scale=True.
SHORT_ORDINAL_STRING_EN = {
    1: 'first',
    2: 'second',
    3: 'third',
    4: 'fourth',
    5: 'fifth',
    6: 'sixth',
    7: 'seventh',
    8: 'eighth',
    9: 'ninth',
    10: 'tenth',
    11: 'eleventh',
    12: 'twelfth',
    13: 'thirteenth',
    14: 'fourteenth',
    15: 'fifteenth',
    16: 'sixteenth',
    17: 'seventeenth',
    18: 'eighteenth',
    19: 'nineteenth',
    20: 'twentieth',
    30: 'thirtieth',
    40: "fortieth",
    50: "fiftieth",
    60: "sixtieth",
    70: "seventieth",
    80: "eightieth",
    90: "ninetieth",
    # NOTE: this key was previously written as 10e3, which is 10000,
    # so "hundredth" resolved to 10000 (or 1/10000 as a fraction).
    100: "hundredth",
    1e3: "thousandth",
    1e6: "millionth",
    1e9: "billionth",
    1e12: "trillionth",
    1e15: "quadrillionth",
    1e18: "quintillionth",
    1e21: "sextillionth",
    1e24: "septillionth",
    1e27: "octillionth",
    1e30: "nonillionth",
    1e33: "decillionth",
    # TODO > 1e33
}
def extractnumber_en(text, short_scale=True):
# English ordinal words keyed by the number they denote, using the
# long scale (1e12 -> "billionth", 1e18 -> "trillionth", ...).
# extractnumber_en() inverts this table when ordinals=True and
# short_scale=False, and isFractional_en() uses every entry greater
# than 2 as a fraction denominator when short_scale=False.
LONG_ORDINAL_STRING_EN = {
    1: 'first',
    2: 'second',
    3: 'third',
    4: 'fourth',
    5: 'fifth',
    6: 'sixth',
    7: 'seventh',
    8: 'eighth',
    9: 'ninth',
    10: 'tenth',
    11: 'eleventh',
    12: 'twelfth',
    13: 'thirteenth',
    14: 'fourteenth',
    15: 'fifteenth',
    16: 'sixteenth',
    17: 'seventeenth',
    18: 'eighteenth',
    19: 'nineteenth',
    20: 'twentieth',
    30: 'thirtieth',
    40: "fortieth",
    50: "fiftieth",
    60: "sixtieth",
    70: "seventieth",
    80: "eightieth",
    90: "ninetieth",
    # NOTE: this key was previously written as 10e3, which is 10000,
    # so "hundredth" resolved to 10000 (or 1/10000 as a fraction).
    100: "hundredth",
    1e3: "thousandth",
    1e6: "millionth",
    1e12: "billionth",
    1e18: "trillionth",
    1e24: "quadrillionth",
    1e30: "quintillionth",
    1e36: "sextillionth",
    1e42: "septillionth",
    1e48: "octillionth",
    1e54: "nonillionth",
    1e60: "decillionth",
    # TODO > 1e60
}
def extractnumber_en(text, short_scale=True, ordinals=False):
"""
This function extracts a number from a text string,
handles pronunciations in long scale and short scale
@ -131,13 +217,14 @@ def extractnumber_en(text, short_scale=True):
Args:
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
Returns:
(int) or (float) or False: The extracted number or False if no number
was found
"""
string_num_en = {"first": 1,
"second": 2,
string_num_en = {
"half": 0.5,
"halves": 0.5,
"hundreds": 100,
@ -148,6 +235,17 @@ def extractnumber_en(text, short_scale=True):
num_string = NUM_STRING_EN[num]
string_num_en[num_string] = num
# first, second...
if ordinals:
if short_scale:
for num in SHORT_ORDINAL_STRING_EN:
num_string = SHORT_ORDINAL_STRING_EN[num]
string_num_en[num_string] = num
else:
for num in LONG_ORDINAL_STRING_EN:
num_string = LONG_ORDINAL_STRING_EN[num]
string_num_en[num_string] = num
# negate next number (-2 = 0 - 2)
negatives = ["negative", "minus"]
@ -196,16 +294,17 @@ def extractnumber_en(text, short_scale=True):
for c in decimal_marker:
components = text.split(c)
if len(components) == 2:
if extractnumber_en(components[0]) is not None \
and extractnumber_en(components[1]):
return extractnumber_en(components[0]) + float(
"0." + str(extractnumber_en(components[1])).split(".")[0])
number = extractnumber_en(components[0])
decimal = extractnumber_en(components[1])
if number is not None and decimal is not None:
# TODO handle number dot number number number
if "." not in str(decimal):
return number + float("0." + str(decimal))
aWords = text.split()
aWords = [word for word in aWords if word not in ["the", "a", "an"]]
val = False
prev_val = None
negative = False
to_sum = []
for idx, word in enumerate(aWords):
@ -225,8 +324,9 @@ def extractnumber_en(text, short_scale=True):
# is the prev word a number and should we sum it?
# twenty two, fifty six
if prev_word in sums:
val = prev_val + val
if prev_word in sums and word in string_num_en:
if val and val < 10:
val = prev_val + val
# is the prev word a number and should we multiply it?
# twenty hundred, six hundred
@ -238,18 +338,19 @@ def extractnumber_en(text, short_scale=True):
# is this a spoken fraction?
# half cup
if val is False:
val = isFractional_en(word)
val = isFractional_en(word, short_scale=short_scale)
# 2 fifths
next_value = isFractional_en(next_word)
if next_value:
if not val:
val = 1
val = val * next_value
if not ordinals:
next_value = isFractional_en(next_word, short_scale=short_scale)
if next_value:
if not val:
val = 1
val = val * next_value
# is this a negative number?
if val and prev_word and prev_word in negatives:
negative = True
val = 0 - val
# let's make sure it isn't a fraction
if not val:
@ -260,7 +361,6 @@ def extractnumber_en(text, short_scale=True):
else:
prev_val = val
# handle long numbers
# six hundred sixty six
# two million five hundred thousand
@ -272,8 +372,6 @@ def extractnumber_en(text, short_scale=True):
if val is not None:
for v in to_sum:
val = val + v
if negative:
val = 0 - val
return val
@ -899,12 +997,13 @@ def extract_datetime_en(string, currentDate=None):
return [extractedDate, resultStr]
def isFractional_en(input_str):
def isFractional_en(input_str, short_scale=True):
"""
This function takes the given text and checks if it is a fraction.
Args:
input_str (str): the string to check if fractional
short_scale (bool): use short scale if True, long scale if False
Returns:
(bool) or (float): False if not a fraction, otherwise the fraction
@ -912,14 +1011,18 @@ def isFractional_en(input_str):
if input_str.endswith('s', -1):
input_str = input_str[:len(input_str) - 1] # e.g. "fifths"
aFrac = ["whole", "half", "third", "fourth", "fifth", "sixth",
"seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth"]
if input_str.lower() in aFrac:
return 1.0 / (aFrac.index(input_str) + 1)
if input_str == "quarter":
return 1.0 / 4
fracts = {"whole": 1, "half": 2, "halve": 2, "quarter": 4}
if short_scale:
for num in SHORT_ORDINAL_STRING_EN:
if num > 2:
fracts[SHORT_ORDINAL_STRING_EN[num]] = num
else:
for num in LONG_ORDINAL_STRING_EN:
if num > 2:
fracts[LONG_ORDINAL_STRING_EN[num]] = num
if input_str.lower() in fracts:
return 1.0 / fracts[input_str.lower()]
return False

View File

@ -70,21 +70,21 @@ def match_one(query, choices):
# TODO:18.08
def extractnumber(text, short_scale=True, lang="en-us"):
def extractnumber(text, short_scale=True, ordinals=False, lang="en-us"):
""" Depreciated, replaced by extract_number. Will be removed
in the 18.08b release.
"""
return extract_number(text, short_scale, lang)
return extract_number(text, short_scale, ordinals, lang)
def extract_number(text, short_scale=True, lang="en-us"):
def extract_number(text, short_scale=True, ordinals=False, lang="en-us"):
"""Takes in a string and extracts a number.
Args:
text (str): the string to extract a number from
short_scale (bool): use short or long scale. See
https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
lang (str): the code for the language text is in
Returns:
(int, float or False): The number extracted or False if the input
@ -93,7 +93,8 @@ def extract_number(text, short_scale=True, lang="en-us"):
lang_lower = str(lang).lower()
if lang_lower.startswith("en"):
return extractnumber_en(text, short_scale)
return extractnumber_en(text, short_scale=short_scale,
ordinals=ordinals)
elif lang_lower.startswith("pt"):
return extractnumber_pt(text)
elif lang_lower.startswith("it"):

View File

@ -60,10 +60,14 @@ class TestNormalize(unittest.TestCase):
"this is an extra test")
def test_extractnumber(self):
self.assertEqual(extractnumber("this is the first test"), 1)
self.assertEqual(extractnumber("this is the first test",
ordinals=True), 1)
self.assertEqual(extractnumber("this is 2 test"), 2)
self.assertEqual(extractnumber("this is second test"), 2)
self.assertEqual(extractnumber("this is second test",
ordinals=True), 2)
self.assertEqual(extractnumber("this is the third test"), 1.0 / 3.0)
self.assertEqual(extractnumber("this is the third test",
ordinals=True), 3.0)
self.assertEqual(extractnumber("this is test number 4"), 4)
self.assertEqual(extractnumber("one third of a cup"), 1.0 / 3.0)
self.assertEqual(extractnumber("three cups"), 3)
@ -102,6 +106,24 @@ class TestNormalize(unittest.TestCase):
self.assertEqual(extractnumber("minus 2"), -2)
self.assertEqual(extractnumber("negative seventy"), -70)
self.assertEqual(extractnumber("thousand million"), 1000000000)
self.assertEqual(extractnumber("sixth third"),
1 / 6 / 3)
self.assertEqual(extractnumber("sixth third", ordinals=True),
3)
self.assertEqual(extractnumber("thirty second"), 30)
self.assertEqual(extractnumber("thirty second", ordinals=True), 32)
self.assertEqual(extractnumber("this is the billionth test",
ordinals=True), 1e09)
self.assertEqual(extractnumber("this is the billionth test"), 1e-9)
self.assertEqual(extractnumber("this is the billionth test",
ordinals=True,
short_scale=False), 1e12)
self.assertEqual(extractnumber("this is the billionth test",
short_scale=False), 1e-12)
# TODO handle this case
# self.assertEqual(
# extractnumber("6 dot six six six"),
# 6.666)
self.assertTrue(extractnumber("The tennis player is fast") is False)
self.assertTrue(extractnumber("fraggle") is False)