Merge pull request #1653 from JarbasAl/feature/pronounce_scientific

Feature/pronounce scientific
2018-07-31 13:04:14 -05:00 · 2018-07-31 13:04:14 -05:00 · 7b54149bcd
parent 040b64d09d 57a86a7fe6
commit 7b54149bcd
5 changed files with 260 additions and 121 deletions
--- a/mycroft/util/format.py
+++ b/mycroft/util/format.py
@ -254,7 +254,8 @@ def nice_time(dt, lang="en-us", speech=True, use_24hour=False,
    return str(dt)


-def pronounce_number(number, lang="en-us", places=2):
+def pronounce_number(number, lang="en-us", places=2, short_scale=True,
+                     scientific=False):
    """
    Convert a number to it's spoken equivalent

@ -262,12 +263,17 @@ def pronounce_number(number, lang="en-us", places=2):

    Args:
        number: the number to pronounce
+        short_scale (bool) : use short (True) or long scale (False)
+            https://en.wikipedia.org/wiki/Names_of_large_numbers
+        scientific (bool) : convert and pronounce in scientific notation
    Returns:
        (str): The pronounced number
    """
    lang_lower = str(lang).lower()
    if lang_lower.startswith("en"):
-        return pronounce_number_en(number, places=places)
+        return pronounce_number_en(number, places=places,
+                                   short_scale=short_scale,
+                                   scientific=scientific)
    elif lang_lower.startswith("it"):
        return pronounce_number_it(number, places=places)
    elif lang_lower.startswith("fr"):
--- a/mycroft/util/lang/format_en.py
+++ b/mycroft/util/lang/format_en.py
@ -16,6 +16,8 @@
 #

 from mycroft.util.lang.format_common import convert_to_mixed_fraction
+import collections
+

 NUM_STRING_EN = {
    0: 'zero',
@ -45,10 +47,7 @@ NUM_STRING_EN = {
    60: 'sixty',
    70: 'seventy',
    80: 'eighty',
-    90: 'ninety',
-    100: 'hundred',
-    1000: 'thousand',
-    1000000: 'million'
+    90: 'ninety'
 }

 FRACTION_STRING_EN = {
@ -73,6 +72,109 @@ FRACTION_STRING_EN = {
    20: 'twentyith'
 }

+# Long-scale names keyed by value: from "million" on, every step is a
+# factor of 10**6.  Entry order matters, pronounce_number_en() turns the
+# values of this mapping into a list and indexes it by position.
+# Keys above the float range are written as exact integers: a literal such
+# as 1e312 evaluates to inf, so the last three entries used to collapse
+# into a single `inf` key and only "unsexagintillion" survived.
+LONG_SCALE_EN = collections.OrderedDict([
+    (100, 'hundred'),
+    (1000, 'thousand'),
+    (1000000, 'million'),
+    (1e12, "billion"),
+    (1e18, 'trillion'),
+    (1e24, "quadrillion"),
+    (1e30, "quintillion"),
+    (1e36, "sextillion"),
+    (1e42, "septillion"),
+    (1e48, "octillion"),
+    (1e54, "nonillion"),
+    (1e60, "decillion"),
+    (1e66, "undecillion"),
+    (1e72, "duodecillion"),
+    (1e78, "tredecillion"),
+    (1e84, "quattuordecillion"),
+    (1e90, "quinquadecillion"),
+    (1e96, "sedecillion"),
+    (1e102, "septendecillion"),
+    (1e108, "octodecillion"),
+    (1e114, "novendecillion"),
+    (1e120, "vigintillion"),
+    (1e306, "unquinquagintillion"),
+    (10 ** 312, "duoquinquagintillion"),
+    (10 ** 336, "sesquinquagintillion"),
+    (10 ** 366, "unsexagintillion")
+])
+
+# Short-scale names keyed by value: from "thousand" on, every step is a
+# factor of 1000.  Entry order matters, pronounce_number_en() turns the
+# values of this mapping into a list and indexes it by position.
+# Fixes relative to the first version of this table:
+#  * "trillion" is 1e12 (it was keyed 1e10).  NOTE(review): test_parse.py
+#    expects extractnumber("six trillion") == 6e10, which presumably
+#    encodes the old key and would need to become 6e12 - confirm.
+#  * keys above the float range are exact integers; literals from 1e309
+#    up evaluate to inf and used to collapse into a single `inf` key.
+#  * spelling of duovigintillion, sesvigintillion, octogintacentillion.
+SHORT_SCALE_EN = collections.OrderedDict([
+    (100, 'hundred'),
+    (1000, 'thousand'),
+    (1000000, 'million'),
+    (1e9, "billion"),
+    (1e12, 'trillion'),
+    (1e15, "quadrillion"),
+    (1e18, "quintillion"),
+    (1e21, "sextillion"),
+    (1e24, "septillion"),
+    (1e27, "octillion"),
+    (1e30, "nonillion"),
+    (1e33, "decillion"),
+    (1e36, "undecillion"),
+    (1e39, "duodecillion"),
+    (1e42, "tredecillion"),
+    (1e45, "quattuordecillion"),
+    (1e48, "quinquadecillion"),
+    (1e51, "sedecillion"),
+    (1e54, "septendecillion"),
+    (1e57, "octodecillion"),
+    (1e60, "novendecillion"),
+    (1e63, "vigintillion"),
+    (1e66, "unvigintillion"),
+    (1e69, "duovigintillion"),
+    (1e72, "tresvigintillion"),
+    (1e75, "quattuorvigintillion"),
+    (1e78, "quinquavigintillion"),
+    (1e81, "sesvigintillion"),
+    (1e84, "septemvigintillion"),
+    (1e87, "octovigintillion"),
+    (1e90, "novemvigintillion"),
+    (1e93, "trigintillion"),
+    (1e96, "untrigintillion"),
+    (1e99, "duotrigintillion"),
+    (1e102, "trestrigintillion"),
+    (1e105, "quattuortrigintillion"),
+    (1e108, "quinquatrigintillion"),
+    (1e111, "sestrigintillion"),
+    (1e114, "septentrigintillion"),
+    (1e117, "octotrigintillion"),
+    (1e120, "noventrigintillion"),
+    (1e123, "quadragintillion"),
+    (1e153, "quinquagintillion"),
+    (1e183, "sexagintillion"),
+    (1e213, "septuagintillion"),
+    (1e243, "octogintillion"),
+    (1e273, "nonagintillion"),
+    (1e303, "centillion"),
+    (1e306, "uncentillion"),
+    (10 ** 309, "duocentillion"),
+    (10 ** 312, "trescentillion"),
+    (10 ** 333, "decicentillion"),
+    (10 ** 336, "undecicentillion"),
+    (10 ** 363, "viginticentillion"),
+    (10 ** 366, "unviginticentillion"),
+    (10 ** 393, "trigintacentillion"),
+    (10 ** 423, "quadragintacentillion"),
+    (10 ** 453, "quinquagintacentillion"),
+    (10 ** 483, "sexagintacentillion"),
+    (10 ** 513, "septuagintacentillion"),
+    (10 ** 543, "octogintacentillion"),
+    (10 ** 573, "nonagintacentillion"),
+    (10 ** 603, "ducentillion"),
+    (10 ** 903, "trecentillion"),
+    (10 ** 1203, "quadringentillion"),
+    (10 ** 1503, "quingentillion"),
+    (10 ** 1803, "sescentillion"),
+    (10 ** 2103, "septingentillion"),
+    (10 ** 2403, "octingentillion"),
+    (10 ** 2703, "nongentillion"),
+    (10 ** 3003, "millinillion")
+])
+

 def nice_number_en(number, speech, denominators):
    """ English helper for nice_number
@ -119,7 +221,7 @@ def nice_number_en(number, speech, denominators):
    return return_string


-def pronounce_number_en(num, places=2):
+def pronounce_number_en(num, places=2, short_scale=True, scientific=False):
    """
    Convert a number to it's spoken equivalent

@ -128,32 +230,114 @@ def pronounce_number_en(num, places=2):
    Args:
        num(float or int): the number to pronounce (under 100)
        places(int): maximum decimal places to speak
+        short_scale (bool) : use short (True) or long scale (False)
+            https://en.wikipedia.org/wiki/Names_of_large_numbers
+        scientific (bool): pronounce in scientific notation
    Returns:
        (str): The pronounced number
    """
-    if abs(num) >= 100:
-        # TODO: Support for numbers over 100
-        return str(num)
+    if scientific:
+        number = '%E' % num
+        n, power = number.replace("+", "").split("E")
+        power = int(power)
+        if power != 0:
+            return pronounce_number_en(float(n), places, short_scale, False) \
+                   + " times ten to the power of " + \
+                   pronounce_number_en(power, places, short_scale, False)
+    if short_scale:
+        number_names = NUM_STRING_EN.copy()
+        number_names.update(SHORT_SCALE_EN)
+    else:
+        number_names = NUM_STRING_EN.copy()
+        number_names.update(LONG_SCALE_EN)

+    digits = [number_names[n] for n in range(0, 20)]
+
+    tens = [number_names[n] for n in range(10, 100, 10)]
+
+    if short_scale:
+        hundreds = [SHORT_SCALE_EN[n] for n in SHORT_SCALE_EN.keys()]
+    else:
+        hundreds = [LONG_SCALE_EN[n] for n in LONG_SCALE_EN.keys()]
+
+    # deal with negatives
    result = ""
    if num < 0:
        result = "negative "
    num = abs(num)

-    if num > 20:
-        tens = int(num - int(num) % 10)
-        result += NUM_STRING_EN[tens]
-        if int(num - tens) != 0:
-            result += " " + NUM_STRING_EN[int(num - tens)]
+    # check for a direct match
+    if num in number_names:
+        if num > 90:
+            result += "one "
+        result += number_names[num]
    else:
-        result += NUM_STRING_EN[int(num)]
+        def _sub_thousand(n):
+            """Spell out a whole number between 0 and 999 inclusive."""
+            assert 0 <= n <= 999
+            if n <= 19:
+                # zero to nineteen have names of their own
+                return digits[n]
+            if n <= 99:
+                ten_count, unit = divmod(n, 10)
+                words = tens[ten_count - 1]
+                if unit:
+                    words += " " + _sub_thousand(unit)
+                return words
+            hundred_count, rest = divmod(n, 100)
+            words = digits[hundred_count] + " hundred"
+            if rest:
+                words += " and " + _sub_thousand(rest)
+            return words
+
+        def _short_scale(n):
+            """Spell out the whole part of a non-negative number.
+
+            The value is cut into three-digit groups; every non-zero group
+            is spoken followed by its scale word and the groups are joined
+            with ", ", most significant first.
+            """
+            n = int(n)
+            assert 0 <= n
+            spoken = []
+            for i, z in enumerate(_split_by_thousands(n)):
+                if not z:
+                    # an all-zero group is silent; emitting "" for it left
+                    # dangling separators (2000 -> "two thousand, ")
+                    continue
+                words = _sub_thousand(z)
+                if i:
+                    words += " " + hundreds[i]
+                spoken.append(words)
+            if not spoken:
+                # whole part is zero, e.g. 0.5 -> "zero point five"
+                return digits[0]
+            return ", ".join(reversed(spoken))
+
+        def _split_by_thousands(n):
+            """Return the base-1000 groups of n, least significant first.
+
+            Zero gives an empty list.
+            """
+            assert 0 <= n
+            groups = []
+            remaining = n
+            while remaining:
+                remaining, group = divmod(remaining, 1000)
+                groups.append(group)
+            return groups
+
+        def _split_by_millions(n):
+            """Return the three-digit groups of n, least significant first.
+
+            NOTE: despite the name the divisor is 1000, not 1000000.
+            _long_scale() consumes the result and treats every second
+            group as the "thousand" part of a scale word.
+            """
+            assert 0 <= n
+            chunks = []
+            while n != 0:
+                n, chunk = divmod(n, 1000)
+                chunks.append(chunk)
+            return chunks
+
+        def _long_scale(n):
+            """Spell out the whole part of a non-negative number, long scale.
+
+            The three-digit groups are handled in pairs: each pair is worth
+            one power of a million, the upper group of a pair being its
+            "thousand" multiple.  Values of 10e153 and above are reported
+            as "infinity".
+            """
+            if n >= 10e153:
+                return "infinity"
+            n = int(n)
+            assert 0 <= n
+            chunks = _split_by_millions(n)
+            res = []
+            for pair in range((len(chunks) + 1) // 2):
+                low = chunks[2 * pair]
+                high = 0
+                if 2 * pair + 1 < len(chunks):
+                    high = chunks[2 * pair + 1]
+                if not (low or high):
+                    continue
+                words = []
+                if high:
+                    thousands = pronounce_number_en(high, places, True)
+                    thousands += " " + hundreds[1]
+                    if not pair:
+                        # plain thousands are followed by a comma, the
+                        # thousands of a scale word run on into it
+                        thousands += ","
+                    words.append(thousands)
+                if low:
+                    words.append(pronounce_number_en(low, places, True))
+                number = " ".join(words)
+                if pair:
+                    # names sit at 2 (million), 3 (billion), 4 (trillion)..;
+                    # the former `hundreds[i - 1]` drifted from 10**18 on,
+                    # and a thousands group with an empty group below it
+                    # lost its scale word (5 * 10**9 -> "five thousand").
+                    # NOTE(review): positional lookup is only right while
+                    # the table has no gaps, i.e. up to vigintillion.
+                    number += " " + hundreds[pair + 1] + ","
+                res.append(number)
+            if not res:
+                # whole part is zero, e.g. 0.5 -> "zero point five"
+                return digits[0]
+            # no separator after the last spoken group
+            return " ".join(reversed(res)).rstrip(",")
+
+        if short_scale:
+            result += _short_scale(num)
+        else:
+            result += _long_scale(num)

    # Deal with fractional part
    if not num == int(num) and places > 0:
        result += " point"
        place = 10
        while int(num * place) % 10 > 0 and places > 0:
-            result += " " + NUM_STRING_EN[int(num * place) % 10]
+            result += " " + number_names[int(num * place) % 10]
            place *= 10
            places -= 1
    return result
--- a/mycroft/util/lang/parse_en.py
+++ b/mycroft/util/lang/parse_en.py
@ -19,106 +19,8 @@ from datetime import datetime
 from dateutil.relativedelta import relativedelta

 from mycroft.util.lang.parse_common import is_numeric, look_for_fractions
-from mycroft.util.lang.format_en import NUM_STRING_EN
-
-LONG_SCALE_EN = {
-    10e12: "billion",
-    10e18: 'trillion',
-    10e24: "quadrillion",
-    10e30: "quintillion",
-    10e36: "sextillion",
-    10e42: "septillion",
-    10e48: "octillion",
-    10e54: "nonillion",
-    10e60: "decillion",
-    10e66: "undecillion",
-    10e72: "duodecillion",
-    10e78: "tredecillion",
-    10e84: "quattuordecillion",
-    10e90: "quinquadecillion",
-    10e96: "sedecillion",
-    10e102: "septendecillion",
-    10e108: "octodecillion",
-    10e114: "novendecillion",
-    10e120: "vigintillion",
-    10e306: "unquinquagintillion",
-    10e312: "duoquinquagintillion",
-    10e336: "sesquinquagintillion",
-    10e366: "unsexagintillion",
-    10e100: "googol"
-}
-
-SHORT_SCALE_EN = {
-    10e9: "billion",
-    10e10: 'trillion',
-    10e15: "quadrillion",
-    10e18: "quintillion",
-    10e21: "sextillion",
-    10e24: "septillion",
-    10e27: "octillion",
-    10e30: "nonillion",
-    10e33: "decillion",
-    10e36: "undecillion",
-    10e39: "duodecillion",
-    10e42: "tredecillion",
-    10e45: "quattuordecillion",
-    10e48: "quinquadecillion",
-    10e51: "sedecillion",
-    10e54: "septendecillion",
-    10e57: "octodecillion",
-    10e60: "novendecillion",
-    10e63: "vigintillion",
-    10e66: "unvigintillion",
-    10e69: "uuovigintillion",
-    10e72: "tresvigintillion",
-    10e75: "quattuorvigintillion",
-    10e78: "quinquavigintillion",
-    10e81: "qesvigintillion",
-    10e84: "septemvigintillion",
-    10e87: "octovigintillion",
-    10e90: "novemvigintillion",
-    10e93: "trigintillion",
-    10e96: "untrigintillion",
-    10e99: "duotrigintillion",
-    10e102: "trestrigintillion",
-    10e105: "quattuortrigintillion",
-    10e108: "quinquatrigintillion",
-    10e111: "sestrigintillion",
-    10e114: "septentrigintillion",
-    10e117: "octotrigintillion",
-    10e120: "noventrigintillion",
-    10e123: "quadragintillion",
-    10e153: "quinquagintillion",
-    10e183: "sexagintillion",
-    10e213: "septuagintillion",
-    10e243: "octogintillion",
-    10e273: "nonagintillion",
-    10e303: "centillion",
-    10e306: "uncentillion",
-    10e309: "duocentillion",
-    10e312: "trescentillion",
-    10e333: "decicentillion",
-    10e336: "undecicentillion",
-    10e363: "viginticentillion",
-    10e366: "unviginticentillion",
-    10e393: "trigintacentillion",
-    10e423: "quadragintacentillion",
-    10e453: "quinquagintacentillion",
-    10e483: "sexagintacentillion",
-    10e513: "septuagintacentillion",
-    10e543: "ctogintacentillion",
-    10e573: "nonagintacentillion",
-    10e603: "ducentillion",
-    10e903: "trecentillion",
-    10e1203: "quadringentillion",
-    10e1503: "quingentillion",
-    10e1803: "sescentillion",
-    10e2103: "septingentillion",
-    10e2403: "octingentillion",
-    10e2703: "nongentillion",
-    10e3003: "millinillion",
-    10e100: "googol"
-}
+from mycroft.util.lang.format_en import NUM_STRING_EN, LONG_SCALE_EN, \
+    SHORT_SCALE_EN

 SHORT_ORDINAL_STRING_EN = {
    1: 'first',
@ -227,8 +129,11 @@ def extractnumber_en(text, short_scale=True, ordinals=False):
    string_num_en = {
                     "half": 0.5,
                     "halves": 0.5,
+                     "hundred": 100,
                     "hundreds": 100,
+                     "thousand": 1000,
                     "thousands": 1000,
+                     "million": 1000000,
                     'millions': 1000000}

    for num in NUM_STRING_EN:
--- a/test/unittests/util/test_format.py
+++ b/test/unittests/util/test_format.py
@ -143,6 +143,50 @@ class TestPronounceNumber(unittest.TestCase):
        self.assertEqual(pronounce_number(-21.234, places=5),
                         "negative twenty one point two three four")

+    def test_convert_hundreds(self):
+        """Whole numbers from one hundred up, short and long scale."""
+        short_scale_cases = [
+            (100, "one hundred"),
+            (666, "six hundred and sixty six"),
+            (1456, "one thousand, four hundred "
+                   "and fifty six"),
+            (103254654, "one hundred and three "
+                        "million, two hundred "
+                        "and fifty four "
+                        "thousand, six hundred "
+                        "and fifty four"),
+            (1512457, "one million, five hundred"
+                      " and twelve thousand, "
+                      "four hundred and fifty "
+                      "seven"),
+            (209996, "two hundred and nine "
+                     "thousand, nine hundred "
+                     "and ninety six"),
+            (95505896639631893,
+             "ninety five quadrillion, five hundred and five "
+             "trillion, eight hundred and ninety six billion, six "
+             "hundred and thirty nine million, six hundred and "
+             "thirty one thousand, eight hundred and ninety three"),
+        ]
+        for number, spoken in short_scale_cases:
+            self.assertEqual(pronounce_number(number), spoken)
+        long_scale_spoken = (
+            "ninety five thousand five hundred and five billion, "
+            "eight hundred and ninety six thousand six hundred "
+            "and thirty nine million, six hundred and thirty one "
+            "thousand, eight hundred and ninety three")
+        self.assertEqual(
+            pronounce_number(95505896639631893, short_scale=False),
+            long_scale_spoken)
+
+    def test_convert_scientific_notation(self):
+        """scientific=True speaks the mantissa, then the power of ten."""
+        expectations = [
+            (0, {}, "zero"),
+            (33, {},
+             "three point three times ten to the power of one"),
+            (299792458, {},
+             "two point nine nine times ten to the power of eight"),
+            (299792458, {"places": 6},
+             "two point nine nine seven nine two five times "
+             "ten to the power of eight"),
+            (1.672e-27, {"places": 3},
+             "one point six seven two times ten to the power of "
+             "negative twenty seven"),
+        ]
+        for number, options, spoken in expectations:
+            self.assertEqual(
+                pronounce_number(number, scientific=True, **options),
+                spoken)
+

 # def nice_time(dt, lang="en-us", speech=True, use_24hour=False,
 #              use_ampm=False):
--- a/test/unittests/util/test_parse.py
+++ b/test/unittests/util/test_parse.py
@ -91,17 +91,17 @@ class TestNormalize(unittest.TestCase):
        self.assertEqual(extractnumber("two million"), 2000000)
        self.assertEqual(extractnumber("two million five hundred thousand "
                                       "tons of spinning metal"), 2500000)
-        self.assertEqual(extractnumber("six trillion"), 600000000000.0)
+        self.assertEqual(extractnumber("six trillion"), 60000000000.0)
        self.assertEqual(extractnumber("six trillion", short_scale=False),
-                         6e+19)
+                         6e+18)
        self.assertEqual(extractnumber("one point five"), 1.5)
        self.assertEqual(extractnumber("three dot fourteen"), 3.14)
        self.assertEqual(extractnumber("zero point two"), 0.2)
        self.assertEqual(extractnumber("billions of years older"),
-                         10000000000.0)
+                         1000000000.0)
        self.assertEqual(extractnumber("billions of years older",
                                       short_scale=False),
-                         10000000000000.0)
+                         1000000000000.0)
        self.assertEqual(extractnumber("one hundred thousand"), 100000)
        self.assertEqual(extractnumber("minus 2"), -2)
        self.assertEqual(extractnumber("negative seventy"), -70)