Fix decimal and fraction parsing.

This updates the _extract_fraction and _extract_decimal functions to
handle the new token format.
pull/1977/head
Chris Rogers 2019-01-30 18:03:25 -05:00
parent 48214ca66a
commit 71836b61ec
1 changed files with 57 additions and 33 deletions

View File

@ -97,10 +97,10 @@ _MULTIPLIES_SHORT_SCALE_EN = set(SHORT_SCALE_EN.values()) |\
# split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
_FRACTION_MARKER = {" and "}
_FRACTION_MARKER = {"and"}
# decimal marker ( 1 point 5 = 1 + 0.5)
_DECIMAL_MARKER = {" point ", " dot "}
_DECIMAL_MARKER = {"point", "dot"}
_STRING_NUM_EN = _invert_dict(NUM_STRING_EN)
_STRING_NUM_EN.update(_generate_plurals(_STRING_NUM_EN))
@ -114,6 +114,35 @@ _STRING_SHORT_ORDINAL_EN = _invert_dict(SHORT_ORDINAL_STRING_EN)
_STRING_LONG_ORDINAL_EN = _invert_dict(LONG_ORDINAL_STRING_EN)
def _partition_list(items, split_on):
"""
Partition a list of items.
Works similarly to str.partition
Args:
items:
split_on callable:
Should return a boolean. Each item will be passed to
this callable in succession, and partitions will be
created any time it returns True.
Returns:
"""
splits = []
current_split = []
for item in items:
if split_on(item):
splits.append(current_split)
splits.append([item])
current_split = []
else:
current_split.append(item)
splits.append(current_split)
return list(filter(lambda x: len(x) != 0, splits))
def _extract_fraction(tokens):
"""
Extract fraction numbers from a string.
@ -131,21 +160,22 @@ def _extract_fraction(tokens):
tokens [_Token]: words and their indexes in the original string.
Returns:
int or float
None if no fraction value is found.
(int or float, [_Token])
The value found, and the list of relevant tokens.
(None, None) if no fraction value is found.
"""
if len(tokens) != 3 or tokens[1].word not in _FRACTION_MARKER:
return None, None
for c in _FRACTION_MARKER:
partitions = _partition_list(tokens, lambda t: t.word == c)
# ensure first is not a fraction and second is a fraction
num1, words1 = _extract_number_with_text_en(tokens[0])
num2, words2 = _extract_number_with_text_en(tokens[2])
if num1 is not None and num2 is not None \
and num1 >= 1 and 0 < num2 < 1:
return num1 + num2, tokens
else:
return None, None
if len(partitions) == 3:
# ensure first is not a fraction and second is a fraction
num1, tokens1 = _extract_number_with_text_en(partitions[0])
num2, tokens2 = _extract_number_with_text_en(partitions[2])
if num1 is not None and num2 is not None \
and num1 >= 1 and 0 < num2 < 1:
return num1 + num2, tokens1 + partitions[1] + tokens2
return None, None
def _extract_decimal(tokens):
@ -164,31 +194,25 @@ def _extract_decimal(tokens):
This does not currently handle things like:
number dot number number number
Args:
text str: The text to parse.
Returns:
int or float
None if no decimal value is found.
Args:
tokens [_Token]: The text to parse.
Returns:
float
None if no decimal value is found.
(float, [_Token])
The value found and relevant tokens.
(None, None) if no decimal value is found.
"""
if len(tokens) != 3 or tokens[1].word not in _DECIMAL_MARKER:
return None, None
number, number_text = _extract_number_with_text_en(tokens[0])
decimal, decimal_text = _extract_number_with_text_en(tokens[2])
if number is not None and decimal is not None:
# TODO handle number dot number number number
if "." not in str(decimal):
return number + float("0." + str(decimal)), tokens
for c in _DECIMAL_MARKER:
partitions = _partition_list(tokens, lambda t: t.word == c)
if len(partitions) == 3:
number, tokens1 = _extract_number_with_text_en(partitions[0])
decimal, tokens2 = _extract_number_with_text_en(partitions[2])
if number is not None and decimal is not None:
# TODO handle number dot number number number
if "." not in str(decimal):
return number + float("0." + str(decimal)), \
tokens1 + partitions[1] + tokens2
return None, None