Fix decimal and fraction parsing.
This updates the _extract_fraction and _extract_decimal functions to handle the new token format.
pull/1977/head
parent
48214ca66a
commit
71836b61ec
|
@ -114,6 +114,35 @@ _STRING_SHORT_ORDINAL_EN = _invert_dict(SHORT_ORDINAL_STRING_EN)
|
||||||
_STRING_LONG_ORDINAL_EN = _invert_dict(LONG_ORDINAL_STRING_EN)
|
_STRING_LONG_ORDINAL_EN = _invert_dict(LONG_ORDINAL_STRING_EN)
|
||||||
|
|
||||||
|
|
||||||
|
def _partition_list(items, split_on):
|
||||||
|
"""
|
||||||
|
Partition a list of items.
|
||||||
|
|
||||||
|
Works similarly to str.partition
|
||||||
|
|
||||||
|
Args:
|
||||||
|
items:
|
||||||
|
split_on callable:
|
||||||
|
Should return a boolean. Each item will be passed to
|
||||||
|
this callable in succession, and partitions will be
|
||||||
|
created any time it returns True.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
|
||||||
|
"""
|
||||||
|
splits = []
|
||||||
|
current_split = []
|
||||||
|
for item in items:
|
||||||
|
if split_on(item):
|
||||||
|
splits.append(current_split)
|
||||||
|
splits.append([item])
|
||||||
|
current_split = []
|
||||||
|
else:
|
||||||
|
current_split.append(item)
|
||||||
|
splits.append(current_split)
|
||||||
|
return list(filter(lambda x: len(x) != 0, splits))
|
||||||
|
|
||||||
|
|
||||||
def _extract_fraction(tokens):
|
def _extract_fraction(tokens):
|
||||||
"""
|
"""
|
||||||
Extract fraction numbers from a string.
|
Extract fraction numbers from a string.
|
||||||
|
@ -131,20 +160,21 @@ def _extract_fraction(tokens):
|
||||||
tokens [_Token]: words and their indexes in the original string.
|
tokens [_Token]: words and their indexes in the original string.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
int or float
|
(int or float, [_Token])
|
||||||
None if no fraction value is found.
|
The value found, and the list of relevant tokens.
|
||||||
|
(None, None) if no fraction value is found.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if len(tokens) != 3 or tokens[1].word not in _FRACTION_MARKER:
|
for c in _FRACTION_MARKER:
|
||||||
return None, None
|
partitions = _partition_list(tokens, lambda t: t.word == c)
|
||||||
|
|
||||||
|
if len(partitions) == 3:
|
||||||
# ensure first is not a fraction and second is a fraction
|
# ensure first is not a fraction and second is a fraction
|
||||||
num1, words1 = _extract_number_with_text_en(tokens[0])
|
num1, tokens1 = _extract_number_with_text_en(partitions[0])
|
||||||
num2, words2 = _extract_number_with_text_en(tokens[2])
|
num2, tokens2 = _extract_number_with_text_en(partitions[2])
|
||||||
if num1 is not None and num2 is not None \
|
if num1 is not None and num2 is not None \
|
||||||
and num1 >= 1 and 0 < num2 < 1:
|
and num1 >= 1 and 0 < num2 < 1:
|
||||||
return num1 + num2, tokens
|
return num1 + num2, tokens1 + partitions[1] + tokens2
|
||||||
else:
|
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
|
@ -164,31 +194,25 @@ def _extract_decimal(tokens):
|
||||||
This does not currently handle things like:
|
This does not currently handle things like:
|
||||||
number dot number number number
|
number dot number number number
|
||||||
|
|
||||||
Args:
|
|
||||||
text str: The text to parse.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
int or float
|
|
||||||
None if no decimal value is found.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
tokens [_Token]: The text to parse.
|
tokens [_Token]: The text to parse.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
float
|
(float, [_Token])
|
||||||
None if no decimal value is found.
|
The value found and relevant tokens.
|
||||||
|
(None, None) if no decimal value is found.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if len(tokens) != 3 or tokens[1].word not in _DECIMAL_MARKER:
|
for c in _DECIMAL_MARKER:
|
||||||
return None, None
|
partitions = _partition_list(tokens, lambda t: t.word == c)
|
||||||
|
if len(partitions) == 3:
|
||||||
number, number_text = _extract_number_with_text_en(tokens[0])
|
number, tokens1 = _extract_number_with_text_en(partitions[0])
|
||||||
decimal, decimal_text = _extract_number_with_text_en(tokens[2])
|
decimal, tokens2 = _extract_number_with_text_en(partitions[2])
|
||||||
if number is not None and decimal is not None:
|
if number is not None and decimal is not None:
|
||||||
# TODO handle number dot number number number
|
# TODO handle number dot number number number
|
||||||
if "." not in str(decimal):
|
if "." not in str(decimal):
|
||||||
return number + float("0." + str(decimal)), tokens
|
return number + float("0." + str(decimal)), \
|
||||||
|
tokens1 + partitions[1] + tokens2
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue