From 6cf2ed814c8647b7f17b790a7c90169f891c5cc5 Mon Sep 17 00:00:00 2001 From: JarbasAI <33701864+JarbasAl@users.noreply.github.com> Date: Tue, 10 Jul 2018 08:54:04 +0100 Subject: [PATCH] feature/allow to pronounce ordinals and very small fractions (#1663) * allow to pronounce ordinals * cleanup * long scale / short scale very small fractions --- mycroft/util/lang/parse_en.py | 159 ++++++++++++++++++++++++------ mycroft/util/parse.py | 11 ++- test/unittests/util/test_parse.py | 26 ++++- 3 files changed, 161 insertions(+), 35 deletions(-) diff --git a/mycroft/util/lang/parse_en.py b/mycroft/util/lang/parse_en.py index 5e05196eae..91538fd78c 100644 --- a/mycroft/util/lang/parse_en.py +++ b/mycroft/util/lang/parse_en.py @@ -120,8 +120,94 @@ SHORT_SCALE_EN = { 10e100: "googol" } +SHORT_ORDINAL_STRING_EN = { + 1: 'first', + 2: 'second', + 3: 'third', + 4: 'fourth', + 5: 'fifth', + 6: 'sixth', + 7: 'seventh', + 8: 'eighth', + 9: 'ninth', + 10: 'tenth', + 11: 'eleventh', + 12: 'twelfth', + 13: 'thirteenth', + 14: 'fourteenth', + 15: 'fifteenth', + 16: 'sixteenth', + 17: 'seventeenth', + 18: 'eighteenth', + 19: 'nineteenth', + 20: 'twentieth', + 30: 'thirtieth', + 40: "fortieth", + 50: "fiftieth", + 60: "sixtieth", + 70: "seventieth", + 80: "eightieth", + 90: "ninetieth", + 1e2: "hundredth", + 1e3: "thousandth", + 1e6: "millionth", + 1e9: "billionth", + 1e12: "trillionth", + 1e15: "quadrillionth", + 1e18: "quintillionth", + 1e21: "sextillionth", + 1e24: "septillionth", + 1e27: "octillionth", + 1e30: "nonillionth", + 1e33: "decillionth" + # TODO > 1e33 +} -def extractnumber_en(text, short_scale=True): +LONG_ORDINAL_STRING_EN = { + 1: 'first', + 2: 'second', + 3: 'third', + 4: 'fourth', + 5: 'fifth', + 6: 'sixth', + 7: 'seventh', + 8: 'eighth', + 9: 'ninth', + 10: 'tenth', + 11: 'eleventh', + 12: 'twelfth', + 13: 'thirteenth', + 14: 'fourteenth', + 15: 'fifteenth', + 16: 'sixteenth', + 17: 'seventeenth', + 18: 'eighteenth', + 19: 'nineteenth', + 20: 'twentieth', + 30: 
'thirtieth', + 40: "fortieth", + 50: "fiftieth", + 60: "sixtieth", + 70: "seventieth", + 80: "eightieth", + 90: "ninetieth", + 1e2: "hundredth", + 1e3: "thousandth", + 1e6: "millionth", + 1e12: "billionth", + 1e18: "trillionth", + 1e24: "quadrillionth", + 1e30: "quintillionth", + 1e36: "sextillionth", + 1e42: "septillionth", + 1e48: "octillionth", + 1e54: "nonillionth", + 1e60: "decillionth" + # TODO > 1e60 +} + + +def extractnumber_en(text, short_scale=True, ordinals=False): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -131,13 +217,14 @@ def extractnumber_en(text, short_scale=True): Args: text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 Returns: (int) or (float) or False: The extracted number or False if no number was found """ - string_num_en = {"first": 1, - "second": 2, + + string_num_en = { "half": 0.5, "halves": 0.5, "hundreds": 100, @@ -148,6 +235,17 @@ def extractnumber_en(text, short_scale=True): num_string = NUM_STRING_EN[num] string_num_en[num_string] = num + # first, second... + if ordinals: + if short_scale: + for num in SHORT_ORDINAL_STRING_EN: + num_string = SHORT_ORDINAL_STRING_EN[num] + string_num_en[num_string] = num + else: + for num in LONG_ORDINAL_STRING_EN: + num_string = LONG_ORDINAL_STRING_EN[num] + string_num_en[num_string] = num + # negate next number (-2 = 0 - 2) negatives = ["negative", "minus"] @@ -196,16 +294,17 @@ def extractnumber_en(text, short_scale=True): for c in decimal_marker: components = text.split(c) if len(components) == 2: - if extractnumber_en(components[0]) is not None \ - and extractnumber_en(components[1]): - return extractnumber_en(components[0]) + float( - "0." 
+ str(extractnumber_en(components[1])).split(".")[0]) + number = extractnumber_en(components[0]) + decimal = extractnumber_en(components[1]) + if number is not False and decimal is not False: + # TODO handle number dot number number number + if "." not in str(decimal): + return number + float("0." + str(decimal)) aWords = text.split() aWords = [word for word in aWords if word not in ["the", "a", "an"]] val = False prev_val = None - negative = False to_sum = [] for idx, word in enumerate(aWords): @@ -225,8 +324,9 @@ def extractnumber_en(text, short_scale=True): # is the prev word a number and should we sum it? # twenty two, fifty six - if prev_word in sums: - val = prev_val + val + if prev_word in sums and word in string_num_en: + if val and val < 10: + val = prev_val + val # is the prev word a number and should we multiply it? # twenty hundred, six hundred @@ -238,18 +338,19 @@ def extractnumber_en(text, short_scale=True): # is this a spoken fraction? # half cup if val is False: - val = isFractional_en(word) + val = isFractional_en(word, short_scale=short_scale) # 2 fifths - next_value = isFractional_en(next_word) - if next_value: - if not val: - val = 1 - val = val * next_value + if not ordinals: + next_value = isFractional_en(next_word, short_scale=short_scale) + if next_value: + if not val: + val = 1 + val = val * next_value # is this a negative number? 
if val and prev_word and prev_word in negatives: - negative = True + val = 0 - val # let's make sure it isn't a fraction if not val: @@ -260,7 +361,6 @@ def extractnumber_en(text, short_scale=True): else: prev_val = val - # handle long numbers # six hundred sixty six # two million five hundred thousand @@ -272,8 +372,6 @@ def extractnumber_en(text, short_scale=True): if val is not None: for v in to_sum: val = val + v - if negative: - val = 0 - val return val @@ -899,12 +997,13 @@ def extract_datetime_en(string, currentDate=None): return [extractedDate, resultStr] -def isFractional_en(input_str): +def isFractional_en(input_str, short_scale=True): """ This function takes the given text and checks if it is a fraction. Args: input_str (str): the string to check if fractional + short_scale (bool): use short scale if True, long scale if False Returns: (bool) or (float): False if not a fraction, otherwise the fraction @@ -912,14 +1011,18 @@ def isFractional_en(input_str): if input_str.endswith('s', -1): input_str = input_str[:len(input_str) - 1] # e.g. 
"fifths" - aFrac = ["whole", "half", "third", "fourth", "fifth", "sixth", - "seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth"] - - if input_str.lower() in aFrac: - return 1.0 / (aFrac.index(input_str) + 1) - if input_str == "quarter": - return 1.0 / 4 + fracts = {"whole": 1, "half": 2, "halve": 2, "quarter": 4} + if short_scale: + for num in SHORT_ORDINAL_STRING_EN: + if num > 2: + fracts[SHORT_ORDINAL_STRING_EN[num]] = num + else: + for num in LONG_ORDINAL_STRING_EN: + if num > 2: + fracts[LONG_ORDINAL_STRING_EN[num]] = num + if input_str.lower() in fracts: + return 1.0 / fracts[input_str.lower()] return False diff --git a/mycroft/util/parse.py b/mycroft/util/parse.py index 8620b680a6..31fa7c849f 100644 --- a/mycroft/util/parse.py +++ b/mycroft/util/parse.py @@ -70,21 +70,21 @@ def match_one(query, choices): # TODO:18.08 -def extractnumber(text, short_scale=True, lang="en-us"): +def extractnumber(text, short_scale=True, ordinals=False, lang="en-us"): """ Depreciated, replaced by extract_number. Will be removed in the 18.08b release. """ - return extract_number(text, short_scale, lang) + return extract_number(text, short_scale, ordinals, lang) -def extract_number(text, short_scale=True, lang="en-us"): +def extract_number(text, short_scale=True, ordinals=False, lang="en-us"): """Takes in a string and extracts a number. Args: text (str): the string to extract a number from short_scale (bool): use short or long scale. 
See https://en.wikipedia.org/wiki/Names_of_large_numbers - + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 lang (str): the code for the language text is in Returns: (int, float or False): The number extracted or False if the input @@ -93,7 +93,8 @@ def extract_number(text, short_scale=True, lang="en-us"): lang_lower = str(lang).lower() if lang_lower.startswith("en"): - return extractnumber_en(text, short_scale) + return extractnumber_en(text, short_scale=short_scale, + ordinals=ordinals) elif lang_lower.startswith("pt"): return extractnumber_pt(text) elif lang_lower.startswith("it"): diff --git a/test/unittests/util/test_parse.py b/test/unittests/util/test_parse.py index 022baf7599..8e80133db7 100644 --- a/test/unittests/util/test_parse.py +++ b/test/unittests/util/test_parse.py @@ -60,10 +60,14 @@ class TestNormalize(unittest.TestCase): "this is an extra test") def test_extractnumber(self): - self.assertEqual(extractnumber("this is the first test"), 1) + self.assertEqual(extractnumber("this is the first test", + ordinals=True), 1) self.assertEqual(extractnumber("this is 2 test"), 2) - self.assertEqual(extractnumber("this is second test"), 2) + self.assertEqual(extractnumber("this is second test", + ordinals=True), 2) self.assertEqual(extractnumber("this is the third test"), 1.0 / 3.0) + self.assertEqual(extractnumber("this is the third test", + ordinals=True), 3.0) self.assertEqual(extractnumber("this is test number 4"), 4) self.assertEqual(extractnumber("one third of a cup"), 1.0 / 3.0) self.assertEqual(extractnumber("three cups"), 3) @@ -102,6 +106,24 @@ class TestNormalize(unittest.TestCase): self.assertEqual(extractnumber("minus 2"), -2) self.assertEqual(extractnumber("negative seventy"), -70) self.assertEqual(extractnumber("thousand million"), 1000000000) + self.assertEqual(extractnumber("sixth third"), + 1 / 6 / 3) + self.assertEqual(extractnumber("sixth third", ordinals=True), + 3) + self.assertEqual(extractnumber("thirty second"), 30) + 
self.assertEqual(extractnumber("thirty second", ordinals=True), 32) + self.assertEqual(extractnumber("this is the billionth test", + ordinals=True), 1e09) + self.assertEqual(extractnumber("this is the billionth test"), 1e-9) + self.assertEqual(extractnumber("this is the billionth test", + ordinals=True, + short_scale=False), 1e12) + self.assertEqual(extractnumber("this is the billionth test", + short_scale=False), 1e-12) + # TODO handle this case + # self.assertEqual( + # extractnumber("6 dot six six six"), + # 6.666) self.assertTrue(extractnumber("The tennis player is fast") is False) self.assertTrue(extractnumber("fraggle") is False)