feature/allow to pronounce ordinals and very small fractions (#1663)

* allow to pronounce ordinals * cleanup * long scale / short scale very small fractions
2018-07-10 08:54:04 +01:00 · 2018-07-10 08:54:04 +01:00 · 6cf2ed814c
parent 1093383443
commit 6cf2ed814c
3 changed files with 161 additions and 35 deletions
--- a/mycroft/util/lang/parse_en.py
+++ b/mycroft/util/lang/parse_en.py
@ -120,8 +120,94 @@ SHORT_SCALE_EN = {
    10e100: "googol"
 }

+# Ordinal words keyed by numeric value, short scale (billionth = 1e9).
+# extractnumber_en() inverts this table when ordinals=True and
+# short_scale=True; isFractional_en() reads every entry above 2 as a
+# fraction denominator.
+SHORT_ORDINAL_STRING_EN = {
+    1: 'first',
+    2: 'second',
+    3: 'third',
+    4: 'fourth',
+    5: 'fifth',
+    6: 'sixth',
+    7: 'seventh',
+    8: 'eighth',
+    9: 'ninth',
+    10: 'tenth',
+    11: 'eleventh',
+    12: 'twelfth',
+    13: 'thirteenth',
+    14: 'fourteenth',
+    15: 'fifteenth',
+    16: 'sixteenth',
+    17: 'seventeenth',
+    18: 'eighteenth',
+    19: 'nineteenth',
+    20: 'twentieth',
+    30: 'thirtieth',
+    40: "fortieth",
+    50: "fiftieth",
+    60: "sixtieth",
+    70: "seventieth",
+    80: "eightieth",
+    90: "ninetieth",
+    # 100, not 10e3: the float literal 10e3 evaluates to 10000
+    100: "hundredth",
+    1e3: "thousandth",
+    1e6: "millionth",
+    1e9: "billionth",
+    1e12: "trillionth",
+    1e15: "quadrillionth",
+    1e18: "quintillionth",
+    1e21: "sextillionth",
+    1e24: "septillionth",
+    1e27: "octillionth",
+    1e30: "nonillionth",
+    1e33: "decillionth"
+    # TODO > 1e33
+}

-def extractnumber_en(text, short_scale=True):
+# Ordinal words keyed by numeric value, long scale (billionth = 1e12).
+# extractnumber_en() inverts this table when ordinals=True and
+# short_scale=False; isFractional_en() reads every entry above 2 as a
+# fraction denominator.
+LONG_ORDINAL_STRING_EN = {
+    1: 'first',
+    2: 'second',
+    3: 'third',
+    4: 'fourth',
+    5: 'fifth',
+    6: 'sixth',
+    7: 'seventh',
+    8: 'eighth',
+    9: 'ninth',
+    10: 'tenth',
+    11: 'eleventh',
+    12: 'twelfth',
+    13: 'thirteenth',
+    14: 'fourteenth',
+    15: 'fifteenth',
+    16: 'sixteenth',
+    17: 'seventeenth',
+    18: 'eighteenth',
+    19: 'nineteenth',
+    20: 'twentieth',
+    30: 'thirtieth',
+    40: "fortieth",
+    50: "fiftieth",
+    60: "sixtieth",
+    70: "seventieth",
+    80: "eightieth",
+    90: "ninetieth",
+    # 100, not 10e3: the float literal 10e3 evaluates to 10000
+    100: "hundredth",
+    1e3: "thousandth",
+    1e6: "millionth",
+    1e12: "billionth",
+    1e18: "trillionth",
+    1e24: "quadrillionth",
+    1e30: "quintillionth",
+    1e36: "sextillionth",
+    1e42: "septillionth",
+    1e48: "octillionth",
+    1e54: "nonillionth",
+    1e60: "decillionth"
+    # TODO > 1e60
+}
+
+
+def extractnumber_en(text, short_scale=True, ordinals=False):
    """
    This function extracts a number from a text string,
    handles pronunciations in long scale and short scale
@ -131,13 +217,14 @@ def extractnumber_en(text, short_scale=True):
    Args:
        text (str): the string to normalize
        short_scale (bool): use short scale if True, long scale if False
+        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found

    """
-    string_num_en = {"first": 1,
-                     "second": 2,
+
+    string_num_en = {
                     "half": 0.5,
                     "halves": 0.5,
                     "hundreds": 100,
@ -148,6 +235,17 @@ def extractnumber_en(text, short_scale=True):
        num_string = NUM_STRING_EN[num]
        string_num_en[num_string] = num

+    # first, second...
+    if ordinals:
+        if short_scale:
+            for num in SHORT_ORDINAL_STRING_EN:
+                num_string = SHORT_ORDINAL_STRING_EN[num]
+                string_num_en[num_string] = num
+        else:
+            for num in LONG_ORDINAL_STRING_EN:
+                num_string = LONG_ORDINAL_STRING_EN[num]
+                string_num_en[num_string] = num
+
    # negate next number (-2 = 0 - 2)
    negatives = ["negative", "minus"]

@ -196,16 +294,17 @@ def extractnumber_en(text, short_scale=True):
    for c in decimal_marker:
        components = text.split(c)
        if len(components) == 2:
-            if extractnumber_en(components[0]) is not None \
-                    and extractnumber_en(components[1]):
-                return extractnumber_en(components[0]) + float(
-                    "0." + str(extractnumber_en(components[1])).split(".")[0])
+            number = extractnumber_en(components[0])
+            decimal = extractnumber_en(components[1])
+            if number is not None and decimal is not None:
+                # TODO handle number dot number number number
+                if "." not in str(decimal):
+                    return number + float("0." + str(decimal))

    aWords = text.split()
    aWords = [word for word in aWords if word not in ["the", "a", "an"]]
    val = False
    prev_val = None
-    negative = False
    to_sum = []
    for idx, word in enumerate(aWords):

@ -225,8 +324,9 @@ def extractnumber_en(text, short_scale=True):

        # is the prev word a number and should we sum it?
        # twenty two, fifty six
-        if prev_word in sums:
-            val = prev_val + val
+        if prev_word in sums and word in string_num_en:
+            if val and val < 10:
+                val = prev_val + val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
@ -238,18 +338,19 @@ def extractnumber_en(text, short_scale=True):
        # is this a spoken fraction?
        # half cup
        if val is False:
-            val = isFractional_en(word)
+            val = isFractional_en(word, short_scale=short_scale)

        # 2 fifths
-        next_value = isFractional_en(next_word)
-        if next_value:
-            if not val:
-                val = 1
-            val = val * next_value
+        if not ordinals:
+            next_value = isFractional_en(next_word, short_scale=short_scale)
+            if next_value:
+                if not val:
+                    val = 1
+                val = val * next_value

        # is this a negative number?
        if val and prev_word and prev_word in negatives:
-            negative = True
+            val = 0 - val

        # let's make sure it isn't a fraction
        if not val:
@ -260,7 +361,6 @@ def extractnumber_en(text, short_scale=True):

        else:
            prev_val = val
-
            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
@ -272,8 +372,6 @@ def extractnumber_en(text, short_scale=True):
    if val is not None:
        for v in to_sum:
            val = val + v
-    if negative:
-        val = 0 - val
    return val


@ -899,12 +997,13 @@ def extract_datetime_en(string, currentDate=None):
    return [extractedDate, resultStr]


-def isFractional_en(input_str):
+def isFractional_en(input_str, short_scale=True):
    """
    This function takes the given text and checks if it is a fraction.

    Args:
        input_str (str): the string to check if fractional
+        short_scale (bool): use short scale if True, long scale if False
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

@ -912,14 +1011,18 @@ def isFractional_en(input_str):
    if input_str.endswith('s', -1):
        input_str = input_str[:len(input_str) - 1]  # e.g. "fifths"

-    aFrac = ["whole", "half", "third", "fourth", "fifth", "sixth",
-             "seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth"]
-
-    if input_str.lower() in aFrac:
-        return 1.0 / (aFrac.index(input_str) + 1)
-    if input_str == "quarter":
-        return 1.0 / 4
+    fracts = {"whole": 1, "half": 2, "halve": 2, "quarter": 4}
+    if short_scale:
+        for num in SHORT_ORDINAL_STRING_EN:
+            if num > 2:
+                fracts[SHORT_ORDINAL_STRING_EN[num]] = num
+    else:
+        for num in LONG_ORDINAL_STRING_EN:
+            if num > 2:
+                fracts[LONG_ORDINAL_STRING_EN[num]] = num

+    if input_str.lower() in fracts:
+        return 1.0 / fracts[input_str.lower()]
    return False


--- a/mycroft/util/parse.py
+++ b/mycroft/util/parse.py
@ -70,21 +70,21 @@ def match_one(query, choices):


 # TODO:18.08
-def extractnumber(text, short_scale=True, lang="en-us"):
+def extractnumber(text, short_scale=True, ordinals=False, lang="en-us"):
    """ Deprecated, replaced by extract_number. Will be removed
    in the 18.08b release.

    """
-    return extract_number(text, short_scale, lang)
+    return extract_number(text, short_scale, ordinals, lang)


-def extract_number(text, short_scale=True, lang="en-us"):
+def extract_number(text, short_scale=True, ordinals=False, lang="en-us"):
    """Takes in a string and extracts a number.
    Args:
        text (str): the string to extract a number from
        short_scale (bool): use short or long scale. See
            https://en.wikipedia.org/wiki/Names_of_large_numbers
-
+        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        lang (str): the code for the language text is in
    Returns:
        (int, float or False): The number extracted or False if the input
@ -93,7 +93,8 @@ def extract_number(text, short_scale=True, lang="en-us"):

    lang_lower = str(lang).lower()
    if lang_lower.startswith("en"):
-        return extractnumber_en(text, short_scale)
+        return extractnumber_en(text, short_scale=short_scale,
+                                ordinals=ordinals)
    elif lang_lower.startswith("pt"):
        return extractnumber_pt(text)
    elif lang_lower.startswith("it"):
--- a/test/unittests/util/test_parse.py
+++ b/test/unittests/util/test_parse.py
@ -60,10 +60,14 @@ class TestNormalize(unittest.TestCase):
                         "this is an extra test")

    def test_extractnumber(self):
-        self.assertEqual(extractnumber("this is the first test"), 1)
+        self.assertEqual(extractnumber("this is the first test",
+                                       ordinals=True), 1)
        self.assertEqual(extractnumber("this is 2 test"), 2)
-        self.assertEqual(extractnumber("this is second test"), 2)
+        self.assertEqual(extractnumber("this is second test",
+                                       ordinals=True), 2)
        self.assertEqual(extractnumber("this is the third test"), 1.0 / 3.0)
+        self.assertEqual(extractnumber("this is the third test",
+                                       ordinals=True), 3.0)
        self.assertEqual(extractnumber("this is test number 4"), 4)
        self.assertEqual(extractnumber("one third of a cup"), 1.0 / 3.0)
        self.assertEqual(extractnumber("three cups"), 3)
@ -102,6 +106,24 @@ class TestNormalize(unittest.TestCase):
        self.assertEqual(extractnumber("minus 2"), -2)
        self.assertEqual(extractnumber("negative seventy"), -70)
        self.assertEqual(extractnumber("thousand million"), 1000000000)
+        self.assertEqual(extractnumber("sixth third"),
+                         1 / 6 / 3)
+        self.assertEqual(extractnumber("sixth third", ordinals=True),
+                         3)
+        self.assertEqual(extractnumber("thirty second"), 30)
+        self.assertEqual(extractnumber("thirty second", ordinals=True), 32)
+        self.assertEqual(extractnumber("this is the billionth test",
+                                       ordinals=True), 1e09)
+        self.assertEqual(extractnumber("this is the billionth test"), 1e-9)
+        self.assertEqual(extractnumber("this is the billionth test",
+                                       ordinals=True,
+                                       short_scale=False), 1e12)
+        self.assertEqual(extractnumber("this is the billionth test",
+                                       short_scale=False), 1e-12)
+        # TODO handle this case
+        # self.assertEqual(
+        #    extractnumber("6 dot six six six"),
+        #    6.666)
        self.assertTrue(extractnumber("The tennis player is fast") is False)
        self.assertTrue(extractnumber("fraggle") is False)