mycroft-core/mycroft/util/lang/parse_common.py

104 lines
3.4 KiB
Python

# -*- coding: utf-8 -*-
#
# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
def is_numeric(input_str):
    """
    Takes in a string and tests to see if it is a number.

    Args:
        input_str (str): string to test if a number
    Returns:
        (bool): True if float() can convert the input, else False
    """
    try:
        float(input_str)
    except (TypeError, ValueError):
        # ValueError: text that does not spell a number.
        # TypeError: input float() cannot convert at all (e.g. None or a
        # list); a yes/no predicate should answer False rather than raise.
        return False
    return True
def look_for_fractions(split_list):
    """
    Decide whether the pieces of a string split on '/' form a fraction.

    Args:
        split_list (list): list created by splitting on '/'
    Returns:
        (bool): True when there are exactly two pieces and both are
                numeric, otherwise False
    """
    # Anything other than exactly "<numerator>/<denominator>" is rejected.
    if len(split_list) != 2:
        return False
    return is_numeric(split_list[0]) and is_numeric(split_list[1])
def _replace_last(source, target, replacement, count=-1):
    """Replace the right-most ``count`` occurrences of target in source.

    A negative count (the default) replaces every occurrence.
    """
    return replacement.join(source.rsplit(target, count))


def extract_numbers_generic(text, pronounce_handler, extract_handler,
                            short_scale=True, ordinals=False):
    """
    Takes in a string and extracts a list of numbers.
    Language agnostic, per language parsers need to be provided

    Args:
        text (str): the string to extract a number from
        pronounce_handler (function): function that pronounces a number
        extract_handler (function): function that extracts the last number
                                    present in a string; called as
                                    extract_handler(text, short_scale,
                                    ordinals) and expected to return a
                                    falsy value when nothing is found
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million.  The default is short scale, which
            is now common in most English speaking countries.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
    Returns:
        list: the values returned by extract_handler, in the reverse of
              the order in which they were extracted
    """
    numbers = []
    to_parse = text
    extract = extract_handler(to_parse, short_scale, ordinals)
    # NOTE(review): the loop tests truthiness, so a handler result of 0
    # ends the extraction just like "nothing found" does.
    while extract:
        numbers.append(extract)
        prev = to_parse
        num_txt = pronounce_handler(extract)
        extract = str(extract)
        if extract.endswith(".0"):
            # show whole floats as integers, e.g. "3.0" -> "3"
            extract = extract[:-2]

        # handle duplicate occurrences, replace last one only:
        # first swap the spoken form for its digits, then blank the digits
        # out so the next pass finds the number before it, which handles
        # cases like "test one two 3"
        to_parse = _replace_last(to_parse, num_txt, extract, 1)
        to_parse = _replace_last(to_parse, extract, " ", 1)

        if to_parse == prev:
            # avoid infinite loops, occasionally pronounced number may be
            # different from extracted text,
            # ie pronounce(0.5) != half and extract(half) == 0.5
            # TODO fix this
            break
        extract = extract_handler(to_parse, short_scale, ordinals)

    numbers.reverse()
    return numbers