parent
7049e65cbe
commit
351381bca2
|
@ -161,9 +161,8 @@ class _ReplaceableNumber():
|
||||||
return "({v}, {t})".format(v=self.value, t=self.tokens)
|
return "({v}, {t})".format(v=self.value, t=self.tokens)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "{n}({v}, {t})".format(n=self.__class__.__name__,
|
return "{n}({v}, {t})".format(n=self.__class__.__name__, v=self.value,
|
||||||
v=self.value,
|
t=self.tokens)
|
||||||
t=self.tokens)
|
|
||||||
|
|
||||||
|
|
||||||
def _tokenize(text):
|
def _tokenize(text):
|
||||||
|
@ -225,23 +224,28 @@ def convert_words_to_numbers(text, short_scale=True, ordinals=False):
|
||||||
"""
|
"""
|
||||||
text = text.lower()
|
text = text.lower()
|
||||||
tokens = _tokenize(text)
|
tokens = _tokenize(text)
|
||||||
numbers_to_replace = _extract_numbers_with_text(tokens, short_scale, ordinals)
|
numbers_to_replace = \
|
||||||
|
_extract_numbers_with_text(tokens, short_scale, ordinals)
|
||||||
numbers_to_replace.sort(key=lambda number: number.start_index)
|
numbers_to_replace.sort(key=lambda number: number.start_index)
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
if not numbers_to_replace or token.index < numbers_to_replace[0].start_index:
|
if not numbers_to_replace or \
|
||||||
|
token.index < numbers_to_replace[0].start_index:
|
||||||
results.append(token.word)
|
results.append(token.word)
|
||||||
else:
|
else:
|
||||||
if numbers_to_replace and token.index == numbers_to_replace[0].start_index:
|
if numbers_to_replace and \
|
||||||
|
token.index == numbers_to_replace[0].start_index:
|
||||||
results.append(str(numbers_to_replace[0].value))
|
results.append(str(numbers_to_replace[0].value))
|
||||||
if numbers_to_replace and token.index == numbers_to_replace[0].end_index:
|
if numbers_to_replace and \
|
||||||
|
token.index == numbers_to_replace[0].end_index:
|
||||||
numbers_to_replace.pop(0)
|
numbers_to_replace.pop(0)
|
||||||
|
|
||||||
return ' '.join(results)
|
return ' '.join(results)
|
||||||
|
|
||||||
|
|
||||||
def _extract_numbers_with_text(tokens, short_scale=True, ordinals=False, fractional_numbers=True):
|
def _extract_numbers_with_text(tokens, short_scale=True,
|
||||||
|
ordinals=False, fractional_numbers=True):
|
||||||
"""
|
"""
|
||||||
Extract all numbers from a list of _Tokens, with the words that
|
Extract all numbers from a list of _Tokens, with the words that
|
||||||
represent them.
|
represent them.
|
||||||
|
@ -264,20 +268,26 @@ def _extract_numbers_with_text(tokens, short_scale=True, ordinals=False, fractio
|
||||||
results = []
|
results = []
|
||||||
while True:
|
while True:
|
||||||
to_replace = \
|
to_replace = \
|
||||||
_extract_number_with_text_en(tokens, short_scale, ordinals, fractional_numbers)
|
_extract_number_with_text_en(tokens, short_scale,
|
||||||
|
ordinals, fractional_numbers)
|
||||||
|
|
||||||
if not to_replace:
|
if not to_replace:
|
||||||
break
|
break
|
||||||
|
|
||||||
results.append(to_replace)
|
results.append(to_replace)
|
||||||
|
|
||||||
tokens = [t if not to_replace.start_index <= t.index <= to_replace.end_index else \
|
tokens = [
|
||||||
_Token(placeholder, t.index) for t in tokens]
|
t if not
|
||||||
|
to_replace.start_index <= t.index <= to_replace.end_index
|
||||||
|
else
|
||||||
|
_Token(placeholder, t.index) for t in tokens
|
||||||
|
]
|
||||||
results.sort(key=lambda n: n.start_index)
|
results.sort(key=lambda n: n.start_index)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def _extract_number_with_text_en(tokens, short_scale=True, ordinals=False, fractional_numbers=True):
|
def _extract_number_with_text_en(tokens, short_scale=True,
|
||||||
|
ordinals=False, fractional_numbers=True):
|
||||||
"""
|
"""
|
||||||
This function extracts a number from a list of _Tokens.
|
This function extracts a number from a list of _Tokens.
|
||||||
|
|
||||||
|
@ -291,13 +301,17 @@ def _extract_number_with_text_en(tokens, short_scale=True, ordinals=False, fract
|
||||||
_ReplaceableNumber
|
_ReplaceableNumber
|
||||||
|
|
||||||
"""
|
"""
|
||||||
number, tokens = _extract_number_with_text_en_helper(tokens, short_scale, ordinals, fractional_numbers)
|
number, tokens = \
|
||||||
|
_extract_number_with_text_en_helper(tokens, short_scale,
|
||||||
|
ordinals, fractional_numbers)
|
||||||
while tokens and tokens[0].word in ARTICLES:
|
while tokens and tokens[0].word in ARTICLES:
|
||||||
tokens.pop(0)
|
tokens.pop(0)
|
||||||
return _ReplaceableNumber(number, tokens)
|
return _ReplaceableNumber(number, tokens)
|
||||||
|
|
||||||
|
|
||||||
def _extract_number_with_text_en_helper(tokens, short_scale=True, ordinals=False, fractional_numbers=True):
|
def _extract_number_with_text_en_helper(tokens,
|
||||||
|
short_scale=True, ordinals=False,
|
||||||
|
fractional_numbers=True):
|
||||||
"""
|
"""
|
||||||
Helper for _extract_number_with_text_en.
|
Helper for _extract_number_with_text_en.
|
||||||
|
|
||||||
|
@ -312,11 +326,13 @@ def _extract_number_with_text_en_helper(tokens, short_scale=True, ordinals=False
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if fractional_numbers:
|
if fractional_numbers:
|
||||||
fraction, fraction_text = _extract_fraction_with_text_en(tokens, short_scale, ordinals)
|
fraction, fraction_text = \
|
||||||
|
_extract_fraction_with_text_en(tokens, short_scale, ordinals)
|
||||||
if fraction:
|
if fraction:
|
||||||
return fraction, fraction_text
|
return fraction, fraction_text
|
||||||
|
|
||||||
decimal, decimal_text = _extract_decimal_with_text_en(tokens, short_scale, ordinals)
|
decimal, decimal_text = \
|
||||||
|
_extract_decimal_with_text_en(tokens, short_scale, ordinals)
|
||||||
if decimal:
|
if decimal:
|
||||||
return decimal, decimal_text
|
return decimal, decimal_text
|
||||||
|
|
||||||
|
@ -345,8 +361,12 @@ def _extract_fraction_with_text_en(tokens, short_scale, ordinals):
|
||||||
partitions = _partition_list(tokens, lambda t: t.word == c)
|
partitions = _partition_list(tokens, lambda t: t.word == c)
|
||||||
|
|
||||||
if len(partitions) == 3:
|
if len(partitions) == 3:
|
||||||
numbers1 = _extract_numbers_with_text(partitions[0], short_scale, ordinals, fractional_numbers=False)
|
numbers1 = \
|
||||||
numbers2 = _extract_numbers_with_text(partitions[2], short_scale, ordinals, fractional_numbers=True)
|
_extract_numbers_with_text(partitions[0], short_scale,
|
||||||
|
ordinals, fractional_numbers=False)
|
||||||
|
numbers2 = \
|
||||||
|
_extract_numbers_with_text(partitions[2], short_scale,
|
||||||
|
ordinals, fractional_numbers=True)
|
||||||
|
|
||||||
if not numbers1 or not numbers2:
|
if not numbers1 or not numbers2:
|
||||||
return None, None
|
return None, None
|
||||||
|
@ -389,14 +409,18 @@ def _extract_decimal_with_text_en(tokens, short_scale, ordinals):
|
||||||
partitions = _partition_list(tokens, lambda t: t.word == c)
|
partitions = _partition_list(tokens, lambda t: t.word == c)
|
||||||
|
|
||||||
if len(partitions) == 3:
|
if len(partitions) == 3:
|
||||||
numbers1 = _extract_numbers_with_text(partitions[0], short_scale, ordinals, fractional_numbers=False)
|
numbers1 = \
|
||||||
numbers2 = _extract_numbers_with_text(partitions[2], short_scale, ordinals, fractional_numbers=False)
|
_extract_numbers_with_text(partitions[0], short_scale,
|
||||||
|
ordinals, fractional_numbers=False)
|
||||||
|
numbers2 = \
|
||||||
|
_extract_numbers_with_text(partitions[2], short_scale,
|
||||||
|
ordinals, fractional_numbers=False)
|
||||||
|
|
||||||
if not numbers1 or not numbers2:
|
if not numbers1 or not numbers2:
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
number = numbers1[-1]# type: _ReplaceableNumber
|
number = numbers1[-1]
|
||||||
decimal = numbers2[0] # type: _ReplaceableNumber
|
decimal = numbers2[0]
|
||||||
|
|
||||||
# TODO handle number dot number number number
|
# TODO handle number dot number number number
|
||||||
if "." not in str(decimal.text):
|
if "." not in str(decimal.text):
|
||||||
|
@ -599,7 +623,8 @@ def extractnumber_en(text, short_scale=True, ordinals=False):
|
||||||
was found
|
was found
|
||||||
|
|
||||||
"""
|
"""
|
||||||
return _extract_number_with_text_en(_tokenize(text), short_scale, ordinals).value
|
return _extract_number_with_text_en(_tokenize(text),
|
||||||
|
short_scale, ordinals).value
|
||||||
|
|
||||||
|
|
||||||
def extract_duration_en(text):
|
def extract_duration_en(text):
|
||||||
|
@ -1439,7 +1464,8 @@ def extract_numbers_en(text, short_scale=True, ordinals=False):
|
||||||
Returns:
|
Returns:
|
||||||
list: list of extracted numbers as floats
|
list: list of extracted numbers as floats
|
||||||
"""
|
"""
|
||||||
results = _extract_numbers_with_text(_tokenize(text), short_scale, ordinals)
|
results = _extract_numbers_with_text(_tokenize(text),
|
||||||
|
short_scale, ordinals)
|
||||||
return [float(result.value) for result in results]
|
return [float(result.value) for result in results]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -141,7 +141,6 @@ class TestNormalize(unittest.TestCase):
|
||||||
self.assertEqual(extract_number("a couple hundred beers"), 200)
|
self.assertEqual(extract_number("a couple hundred beers"), 200)
|
||||||
self.assertEqual(extract_number("a couple thousand beers"), 2000)
|
self.assertEqual(extract_number("a couple thousand beers"), 2000)
|
||||||
|
|
||||||
|
|
||||||
def test_extract_duration_en(self):
|
def test_extract_duration_en(self):
|
||||||
self.assertEqual(extract_duration("10 seconds"), (10.0, ""))
|
self.assertEqual(extract_duration("10 seconds"), (10.0, ""))
|
||||||
self.assertEqual(extract_duration("5 minutes"), (300.0, ""))
|
self.assertEqual(extract_duration("5 minutes"), (300.0, ""))
|
||||||
|
@ -150,12 +149,21 @@ class TestNormalize(unittest.TestCase):
|
||||||
self.assertEqual(extract_duration("25 weeks"), (15120000.0, ""))
|
self.assertEqual(extract_duration("25 weeks"), (15120000.0, ""))
|
||||||
self.assertEqual(extract_duration("seven hours"), (25200.0, ""))
|
self.assertEqual(extract_duration("seven hours"), (25200.0, ""))
|
||||||
self.assertEqual(extract_duration("7.5 seconds"), (7.5, ""))
|
self.assertEqual(extract_duration("7.5 seconds"), (7.5, ""))
|
||||||
self.assertEqual(extract_duration("eight and a half days thirty nine seconds"), (734439.0, ""))
|
self.assertEqual(extract_duration("eight and a half days thirty"
|
||||||
self.assertEqual(extract_duration("Set a timer for 30 minutes"), (1800.0, "set a timer for"))
|
" nine seconds"), (734439.0, ""))
|
||||||
self.assertEqual(extract_duration("Four and a half minutes until sunset"), (270.0, "until sunset"))
|
self.assertEqual(extract_duration("Set a timer for 30 minutes"),
|
||||||
self.assertEqual(extract_duration("Nineteen minutes past the hour"), (1140.0, "past the hour"))
|
(1800.0, "set a timer for"))
|
||||||
self.assertEqual(extract_duration("wake me up in three weeks, four hundred ninety seven days, and three hundred 91.6 seconds"), (44755591.6, "wake me up in , , and"))
|
self.assertEqual(extract_duration("Four and a half minutes until"
|
||||||
self.assertEqual(extract_duration("The movie is one hour, fifty seven and a half minutes long"), (7050.0, "the movie is , long"))
|
" sunset"), (270.0, "until sunset"))
|
||||||
|
self.assertEqual(extract_duration("Nineteen minutes past the hour"),
|
||||||
|
(1140.0, "past the hour"))
|
||||||
|
self.assertEqual(extract_duration("wake me up in three weeks, four"
|
||||||
|
" hundred ninety seven days, and"
|
||||||
|
" three hundred 91.6 seconds"),
|
||||||
|
(44755591.6, "wake me up in , , and"))
|
||||||
|
self.assertEqual(extract_duration("The movie is one hour, fifty seven"
|
||||||
|
" and a half minutes long"),
|
||||||
|
(7050.0, "the movie is , long"))
|
||||||
|
|
||||||
def test_extractdatetime_en(self):
|
def test_extractdatetime_en(self):
|
||||||
def extractWithFormat(text):
|
def extractWithFormat(text):
|
||||||
|
@ -554,10 +562,10 @@ class TestNormalize(unittest.TestCase):
|
||||||
self.assertEqual(extract_numbers("two pigs and six trillion bacteria",
|
self.assertEqual(extract_numbers("two pigs and six trillion bacteria",
|
||||||
short_scale=False), [2, 6e18])
|
short_scale=False), [2, 6e18])
|
||||||
self.assertEqual(extract_numbers("thirty second or first",
|
self.assertEqual(extract_numbers("thirty second or first",
|
||||||
ordinals=True), [32, 1])
|
ordinals=True), [32, 1])
|
||||||
self.assertEqual(extract_numbers("this is a seven eight nine and a "
|
self.assertEqual(extract_numbers("this is a seven eight nine and a"
|
||||||
"half test"),
|
" half test"),
|
||||||
[7.0, 8.0, 9.5])
|
[7.0, 8.0, 9.5])
|
||||||
|
|
||||||
def test_contractions(self):
|
def test_contractions(self):
|
||||||
self.assertEqual(normalize("ain't"), "is not")
|
self.assertEqual(normalize("ain't"), "is not")
|
||||||
|
|
Loading…
Reference in New Issue