From 6cf2ed814c8647b7f17b790a7c90169f891c5cc5 Mon Sep 17 00:00:00 2001
From: JarbasAI <33701864+JarbasAl@users.noreply.github.com>
Date: Tue, 10 Jul 2018 08:54:04 +0100
Subject: [PATCH] feature/allow to pronounce ordinals and very small fractions
 (#1663)

* allow to pronounce ordinals

* cleanup

* long scale / short scale very small fractions
---
 mycroft/util/lang/parse_en.py     | 159 ++++++++++++++++++++++++------
 mycroft/util/parse.py             |  11 ++-
 test/unittests/util/test_parse.py |  26 ++++-
 3 files changed, 161 insertions(+), 35 deletions(-)

diff --git a/mycroft/util/lang/parse_en.py b/mycroft/util/lang/parse_en.py
index 5e05196eae..91538fd78c 100644
--- a/mycroft/util/lang/parse_en.py
+++ b/mycroft/util/lang/parse_en.py
@@ -120,8 +120,94 @@ SHORT_SCALE_EN = {
     10e100: "googol"
 }
 
+SHORT_ORDINAL_STRING_EN = {  # value -> English ordinal word (short scale)
+    1: 'first',
+    2: 'second',
+    3: 'third',
+    4: 'fourth',
+    5: 'fifth',
+    6: 'sixth',
+    7: 'seventh',
+    8: 'eighth',
+    9: 'ninth',
+    10: 'tenth',
+    11: 'eleventh',
+    12: 'twelfth',
+    13: 'thirteenth',
+    14: 'fourteenth',
+    15: 'fifteenth',
+    16: 'sixteenth',
+    17: 'seventeenth',
+    18: 'eighteenth',
+    19: 'nineteenth',
+    20: 'twentieth',
+    30: 'thirtieth',
+    40: "fortieth",
+    50: "fiftieth",
+    60: "sixtieth",
+    70: "seventieth",
+    80: "eightieth",
+    90: "ninetieth",
+    1e2: "hundredth",  # 100; note that 10eN means 10 * 10**N, not 10**N
+    1e3: "thousandth",
+    1e6: "millionth",
+    1e9: "billionth",
+    1e12: "trillionth",
+    1e15: "quadrillionth",
+    1e18: "quintillionth",
+    1e21: "sextillionth",
+    1e24: "septillionth",
+    1e27: "octillionth",
+    1e30: "nonillionth",
+    1e33: "decillionth"
+    # TODO > 1e33
+}
 
-def extractnumber_en(text, short_scale=True):
+LONG_ORDINAL_STRING_EN = {  # value -> English ordinal word (long scale)
+    1: 'first',
+    2: 'second',
+    3: 'third',
+    4: 'fourth',
+    5: 'fifth',
+    6: 'sixth',
+    7: 'seventh',
+    8: 'eighth',
+    9: 'ninth',
+    10: 'tenth',
+    11: 'eleventh',
+    12: 'twelfth',
+    13: 'thirteenth',
+    14: 'fourteenth',
+    15: 'fifteenth',
+    16: 'sixteenth',
+    17: 'seventeenth',
+    18: 'eighteenth',
+    19: 'nineteenth',
+    20: 'twentieth',
+    30: 'thirtieth',
+    40: "fortieth",
+    50: "fiftieth",
+    60: "sixtieth",
+    70: "seventieth",
+    80: "eightieth",
+    90: "ninetieth",
+    1e2: "hundredth",  # 100; note that 10eN means 10 * 10**N, not 10**N
+    1e3: "thousandth",
+    1e6: "millionth",
+    1e12: "billionth",
+    1e18: "trillionth",
+    1e24: "quadrillionth",
+    1e30: "quintillionth",
+    1e36: "sextillionth",
+    1e42: "septillionth",
+    1e48: "octillionth",
+    1e54: "nonillionth",
+    1e60: "decillionth"
+    # TODO > 1e60
+}
+
+
+def extractnumber_en(text, short_scale=True, ordinals=False):
     """
     This function extracts a number from a text string,
     handles pronunciations in long scale and short scale
@@ -131,13 +217,14 @@ def extractnumber_en(text, short_scale=True):
     Args:
         text (str): the string to normalize
         short_scale (bool): use short scale if True, long scale if False
+        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
     Returns:
         (int) or (float) or False: The extracted number or False if no number
                                    was found
 
     """
-    string_num_en = {"first": 1,
-                     "second": 2,
+
+    string_num_en = {
                      "half": 0.5,
                      "halves": 0.5,
                      "hundreds": 100,
@@ -148,6 +235,17 @@ def extractnumber_en(text, short_scale=True):
         num_string = NUM_STRING_EN[num]
         string_num_en[num_string] = num
 
+    # first, second...
+    if ordinals:
+        if short_scale:
+            for num in SHORT_ORDINAL_STRING_EN:
+                num_string = SHORT_ORDINAL_STRING_EN[num]
+                string_num_en[num_string] = num
+        else:
+            for num in LONG_ORDINAL_STRING_EN:
+                num_string = LONG_ORDINAL_STRING_EN[num]
+                string_num_en[num_string] = num
+
     # negate next number (-2 = 0 - 2)
     negatives = ["negative", "minus"]
 
@@ -196,16 +294,17 @@ def extractnumber_en(text, short_scale=True):
     for c in decimal_marker:
         components = text.split(c)
         if len(components) == 2:
-            if extractnumber_en(components[0]) is not None \
-                    and extractnumber_en(components[1]):
-                return extractnumber_en(components[0]) + float(
-                    "0." + str(extractnumber_en(components[1])).split(".")[0])
+            number = extractnumber_en(components[0])
+            decimal = extractnumber_en(components[1])
+            if number is not None and decimal is not None:
+                # TODO handle number dot number number number
+                if "." not in str(decimal):
+                    return number + float("0." + str(decimal))
 
     aWords = text.split()
     aWords = [word for word in aWords if word not in ["the", "a", "an"]]
     val = False
     prev_val = None
-    negative = False
     to_sum = []
     for idx, word in enumerate(aWords):
 
@@ -225,8 +324,9 @@ def extractnumber_en(text, short_scale=True):
 
         # is the prev word a number and should we sum it?
         # twenty two, fifty six
-        if prev_word in sums:
-            val = prev_val + val
+        if prev_word in sums and word in string_num_en:
+            if val and val < 10:
+                val = prev_val + val
 
         # is the prev word a number and should we multiply it?
         # twenty hundred, six hundred
@@ -238,18 +338,19 @@ def extractnumber_en(text, short_scale=True):
         # is this a spoken fraction?
         # half cup
         if val is False:
-            val = isFractional_en(word)
+            val = isFractional_en(word, short_scale=short_scale)
 
         # 2 fifths
-        next_value = isFractional_en(next_word)
-        if next_value:
-            if not val:
-                val = 1
-            val = val * next_value
+        if not ordinals:
+            next_value = isFractional_en(next_word, short_scale=short_scale)
+            if next_value:
+                if not val:
+                    val = 1
+                val = val * next_value
 
         # is this a negative number?
         if val and prev_word and prev_word in negatives:
-            negative = True
+            val = 0 - val
 
         # let's make sure it isn't a fraction
         if not val:
@@ -260,7 +361,6 @@ def extractnumber_en(text, short_scale=True):
 
         else:
             prev_val = val
-
             # handle long numbers
             # six hundred sixty six
             # two million five hundred thousand
@@ -272,8 +372,6 @@ def extractnumber_en(text, short_scale=True):
     if val is not None:
         for v in to_sum:
             val = val + v
-    if negative:
-        val = 0 - val
     return val
 
 
@@ -899,12 +997,13 @@ def extract_datetime_en(string, currentDate=None):
     return [extractedDate, resultStr]
 
 
-def isFractional_en(input_str):
+def isFractional_en(input_str, short_scale=True):
     """
     This function takes the given text and checks if it is a fraction.
 
     Args:
         input_str (str): the string to check if fractional
+        short_scale (bool): use short scale if True, long scale if False
     Returns:
         (bool) or (float): False if not a fraction, otherwise the fraction
 
@@ -912,14 +1011,18 @@ def isFractional_en(input_str):
     if input_str.endswith('s', -1):
         input_str = input_str[:len(input_str) - 1]  # e.g. "fifths"
 
-    aFrac = ["whole", "half", "third", "fourth", "fifth", "sixth",
-             "seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth"]
-
-    if input_str.lower() in aFrac:
-        return 1.0 / (aFrac.index(input_str) + 1)
-    if input_str == "quarter":
-        return 1.0 / 4
+    fracts = {"whole": 1, "half": 2, "halve": 2, "quarter": 4}
+    if short_scale:
+        for num in SHORT_ORDINAL_STRING_EN:
+            if num > 2:
+                fracts[SHORT_ORDINAL_STRING_EN[num]] = num
+    else:
+        for num in LONG_ORDINAL_STRING_EN:
+            if num > 2:
+                fracts[LONG_ORDINAL_STRING_EN[num]] = num
 
+    if input_str.lower() in fracts:
+        return 1.0 / fracts[input_str.lower()]
     return False
 
 
diff --git a/mycroft/util/parse.py b/mycroft/util/parse.py
index 8620b680a6..31fa7c849f 100644
--- a/mycroft/util/parse.py
+++ b/mycroft/util/parse.py
@@ -70,21 +70,21 @@ def match_one(query, choices):
 
 
 # TODO:18.08
-def extractnumber(text, short_scale=True, lang="en-us"):
+def extractnumber(text, short_scale=True, ordinals=False, lang="en-us"):
     """ Depreciated, replaced by extract_number. Will be removed
     in the 18.08b release.
 
     """
-    return extract_number(text, short_scale, lang)
+    return extract_number(text, short_scale, ordinals, lang)
 
 
-def extract_number(text, short_scale=True, lang="en-us"):
+def extract_number(text, short_scale=True, ordinals=False, lang="en-us"):
     """Takes in a string and extracts a number.
     Args:
         text (str): the string to extract a number from
         short_scale (bool): use short or long scale. See
             https://en.wikipedia.org/wiki/Names_of_large_numbers
-
+        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
         lang (str): the code for the language text is in
     Returns:
         (int, float or False): The number extracted or False if the input
@@ -93,7 +93,8 @@ def extract_number(text, short_scale=True, lang="en-us"):
 
     lang_lower = str(lang).lower()
     if lang_lower.startswith("en"):
-        return extractnumber_en(text, short_scale)
+        return extractnumber_en(text, short_scale=short_scale,
+                                ordinals=ordinals)
     elif lang_lower.startswith("pt"):
         return extractnumber_pt(text)
     elif lang_lower.startswith("it"):
diff --git a/test/unittests/util/test_parse.py b/test/unittests/util/test_parse.py
index 022baf7599..8e80133db7 100644
--- a/test/unittests/util/test_parse.py
+++ b/test/unittests/util/test_parse.py
@@ -60,10 +60,14 @@ class TestNormalize(unittest.TestCase):
                          "this is an extra test")
 
     def test_extractnumber(self):
-        self.assertEqual(extractnumber("this is the first test"), 1)
+        self.assertEqual(extractnumber("this is the first test",
+                                       ordinals=True), 1)
         self.assertEqual(extractnumber("this is 2 test"), 2)
-        self.assertEqual(extractnumber("this is second test"), 2)
+        self.assertEqual(extractnumber("this is second test",
+                                       ordinals=True), 2)
         self.assertEqual(extractnumber("this is the third test"), 1.0 / 3.0)
+        self.assertEqual(extractnumber("this is the third test",
+                                       ordinals=True), 3.0)
         self.assertEqual(extractnumber("this is test number 4"), 4)
         self.assertEqual(extractnumber("one third of a cup"), 1.0 / 3.0)
         self.assertEqual(extractnumber("three cups"), 3)
@@ -102,6 +106,24 @@ class TestNormalize(unittest.TestCase):
         self.assertEqual(extractnumber("minus 2"), -2)
         self.assertEqual(extractnumber("negative seventy"), -70)
         self.assertEqual(extractnumber("thousand million"), 1000000000)
+        self.assertEqual(extractnumber("sixth third"),
+                         1 / 6 / 3)
+        self.assertEqual(extractnumber("sixth third", ordinals=True),
+                         3)
+        self.assertEqual(extractnumber("thirty second"), 30)
+        self.assertEqual(extractnumber("thirty second", ordinals=True), 32)
+        self.assertEqual(extractnumber("this is the billionth test",
+                                       ordinals=True), 1e09)
+        self.assertEqual(extractnumber("this is the billionth test"), 1e-9)
+        self.assertEqual(extractnumber("this is the billionth test",
+                                       ordinals=True,
+                                       short_scale=False), 1e12)
+        self.assertEqual(extractnumber("this is the billionth test",
+                                       short_scale=False), 1e-12)
+        # TODO handle this case
+        # self.assertEqual(
+        #    extractnumber("6 dot six six six"),
+        #    6.666)
         self.assertTrue(extractnumber("The tennis player is fast") is False)
         self.assertTrue(extractnumber("fraggle") is False)