2017-12-30 00:14:28 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
2017-02-23 12:40:46 +00:00
|
|
|
#
|
2017-10-04 06:28:44 +00:00
|
|
|
# Copyright 2017 Mycroft AI Inc.
|
2017-02-23 12:40:46 +00:00
|
|
|
#
|
2017-10-04 06:28:44 +00:00
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
2017-02-23 12:40:46 +00:00
|
|
|
#
|
2017-10-04 06:28:44 +00:00
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
2017-02-23 12:40:46 +00:00
|
|
|
#
|
2017-10-04 06:28:44 +00:00
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
#
|
2018-07-11 18:33:20 +00:00
|
|
|
from dateutil.tz import tzlocal
|
2017-12-15 11:54:14 +00:00
|
|
|
from difflib import SequenceMatcher
|
2018-07-11 18:33:20 +00:00
|
|
|
import mycroft.util.time
|
2017-12-15 11:54:14 +00:00
|
|
|
|
2017-12-30 00:14:28 +00:00
|
|
|
from mycroft.util.lang.parse_en import *
|
|
|
|
from mycroft.util.lang.parse_pt import *
|
|
|
|
from mycroft.util.lang.parse_es import *
|
2018-01-25 14:58:32 +00:00
|
|
|
from mycroft.util.lang.parse_it import *
|
2018-01-31 15:57:09 +00:00
|
|
|
from mycroft.util.lang.parse_sv import *
|
2018-06-12 06:55:21 +00:00
|
|
|
from mycroft.util.lang.parse_de import extractnumber_de
|
|
|
|
from mycroft.util.lang.parse_de import extract_datetime_de
|
|
|
|
from mycroft.util.lang.parse_de import normalize_de
|
2018-02-12 11:02:54 +00:00
|
|
|
from mycroft.util.lang.parse_fr import extractnumber_fr
|
|
|
|
from mycroft.util.lang.parse_fr import extract_datetime_fr
|
|
|
|
from mycroft.util.lang.parse_fr import normalize_fr
|
|
|
|
|
2017-12-30 00:14:28 +00:00
|
|
|
from mycroft.util.lang.parse_common import *
|
|
|
|
|
2017-12-15 11:54:14 +00:00
|
|
|
|
|
|
|
def fuzzy_match(x, against):
    """Score how closely two strings resemble each other.

    Args:
        x: first sequence to compare (normally a string)
        against: sequence that x is compared with

    Returns:
        float: similarity ratio reported by difflib.SequenceMatcher --
               1.0 for a perfect match, down to 0.0 for no match at all.
    """
    matcher = SequenceMatcher(None, x, against)
    return matcher.ratio()
|
2017-07-10 20:33:21 +00:00
|
|
|
|
|
|
|
|
2018-02-01 07:52:14 +00:00
|
|
|
def match_one(query, choices):
    """
        Pick the choice that most closely resembles the query.

        Arguments:
            query:   string to test
            choices: list of candidate strings, or a dictionary whose keys
                     are the candidate strings

        Returns: tuple (best match, score). For a list the best match is the
                 winning entry itself; for a dictionary it is the value
                 stored under the winning key.

        Raises:
            ValueError: if choices is neither a list nor a dict
    """
    choices_is_dict = isinstance(choices, dict)
    if choices_is_dict:
        candidates = list(choices.keys())
    elif isinstance(choices, list):
        candidates = choices
    else:
        raise ValueError('a list or dict of choices must be provided')

    # Seed the search with the first candidate, then keep whichever later
    # candidate scores strictly higher (ties keep the earlier candidate).
    winner = candidates[0]
    top_score = fuzzy_match(query, winner)
    for candidate in candidates[1:]:
        candidate_score = fuzzy_match(query, candidate)
        if candidate_score > top_score:
            winner, top_score = candidate, candidate_score

    if choices_is_dict:
        # Callers of the dict form want the mapped value, not the key
        return (choices[winner], top_score)
    return (winner, top_score)
|
|
|
|
|
|
|
|
|
2018-07-02 07:44:28 +00:00
|
|
|
# TODO:18.08
|
2018-07-10 07:54:04 +00:00
|
|
|
def extractnumber(text, short_scale=True, ordinals=False, lang="en-us"):
    """ Deprecated alias of extract_number, kept for backwards
        compatibility. Will be removed in the 18.08b release.

        All arguments are handed to extract_number unchanged and its
        result is returned as-is.
    """
    return extract_number(text,
                          short_scale=short_scale,
                          ordinals=ordinals,
                          lang=lang)
|
2018-07-02 07:44:28 +00:00
|
|
|
|
|
|
|
|
2018-07-10 07:54:04 +00:00
|
|
|
def extract_number(text, short_scale=True, ordinals=False, lang="en-us"):
    """Pull a number out of a string using the parser for the given language.

    Args:
        text (str): the string to extract a number from
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million.  The default is short scale, which
            is now common in most English speaking countries.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
            Only forwarded to the English parser.
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of
            1/3.  Only forwarded to the English parser.
        lang (str): the BCP-47 code for the language to use

    Returns:
        (int, float or False): The number extracted or False if the input
                               text contains no numbers.  When no parser
                               exists for the language, the input text is
                               returned unchanged.
    """
    lang_code = str(lang).lower()

    if lang_code.startswith("en"):
        return extractnumber_en(text, short_scale=short_scale,
                                ordinals=ordinals)
    if lang_code.startswith("pt"):
        return extractnumber_pt(text)
    if lang_code.startswith("it"):
        return extractnumber_it(text)
    if lang_code.startswith("fr"):
        return extractnumber_fr(text)
    if lang_code.startswith("sv"):
        return extractnumber_sv(text)
    if lang_code.startswith("de"):
        return extractnumber_de(text)

    # TODO: extractnumber for other languages
    return text
|
|
|
|
|
|
|
|
|
2017-07-10 20:33:21 +00:00
|
|
|
def extract_datetime(text, anchorDate=None, lang="en-us"):
    """
    Extracts date and time information from a sentence.  Parses many of the
    common ways that humans express dates and times, including relative dates
    like "5 days from today", "tomorrow", and "Tuesday".

    Vague terminology are given arbitrary values, like:
        - morning = 8 AM
        - afternoon = 3 PM
        - evening = 7 PM

    If a time isn't supplied or implied, the function defaults to 12 AM

    Args:
        text (str): the text to be interpreted
        anchorDate (:obj:`datetime`, optional): the date to be used for
            relative dating (for example, what does "tomorrow" mean?).
            Defaults to the current local date/time.
        lang (string): the BCP-47 code for the language to use

    Returns:
        [:obj:`datetime`, :obj:`str`]: 'datetime' is the extracted date
            as a datetime object in the user's local timezone.
            'leftover_string' is the original phrase with all date and time
            related keywords stripped out. See examples for further
            clarification

            Returns 'None' if the input string is empty.

            When no datetime parser exists for the language, the input
            text is returned unchanged.

    Examples:

        >>> extract_datetime(
        ... "What is the weather like the day after tomorrow?",
        ... datetime(2017, 6, 30, 0, 0)
        ... )
        [datetime.datetime(2017, 7, 2, 0, 0), 'what is weather like']

        >>> extract_datetime(
        ... "Set up an appointment 2 weeks from Sunday at 5 pm",
        ... datetime(2016, 2, 19, 0, 0)
        ... )
        [datetime.datetime(2016, 3, 6, 17, 0), 'set up appointment']
    """

    lang_lower = str(lang).lower()

    if not anchorDate:
        # This file imports the module (`import mycroft.util.time`) rather
        # than the bare name, so the call has to be qualified to resolve.
        anchorDate = mycroft.util.time.now_local()

    if lang_lower.startswith("en"):
        return extract_datetime_en(text, anchorDate)
    elif lang_lower.startswith("pt"):
        return extract_datetime_pt(text, anchorDate)
    elif lang_lower.startswith("it"):
        return extract_datetime_it(text, anchorDate)
    elif lang_lower.startswith("fr"):
        return extract_datetime_fr(text, anchorDate)
    elif lang_lower.startswith("sv"):
        return extract_datetime_sv(text, anchorDate)
    elif lang_lower.startswith("de"):
        return extract_datetime_de(text, anchorDate)

    # TODO: extract_datetime for other languages
    return text
|
2017-05-25 23:31:43 +00:00
|
|
|
# ==============================================================
|
2017-02-23 12:40:46 +00:00
|
|
|
|
2017-05-30 19:30:41 +00:00
|
|
|
|
2017-02-23 12:40:46 +00:00
|
|
|
def normalize(text, lang="en-us", remove_articles=True):
    """Prepare a string for parsing

    Hands the text to the normalizer of the requested language, which
    makes numbers consistent, gets rid of contractions, etc.

    Args:
        text (str): the string to normalize
        lang (str): the code for the language text is in
        remove_articles (bool): whether to remove articles (like 'a', or
                                'the'). True by default.
    Returns:
        (str): The normalized string.  When no normalizer exists for the
               language, the input text is returned unchanged.
    """
    lang_code = str(lang).lower()

    if lang_code.startswith("en"):
        return normalize_en(text, remove_articles)
    if lang_code.startswith("es"):
        return normalize_es(text, remove_articles)
    if lang_code.startswith("pt"):
        return normalize_pt(text, remove_articles)
    if lang_code.startswith("it"):
        return normalize_it(text, remove_articles)
    if lang_code.startswith("fr"):
        return normalize_fr(text, remove_articles)
    if lang_code.startswith("sv"):
        return normalize_sv(text, remove_articles)
    if lang_code.startswith("de"):
        return normalize_de(text, remove_articles)

    # TODO: Normalization for other languages
    return text
|
|
|
|
|
|
|
|
|
2017-12-30 00:14:28 +00:00
|
|
|
def get_gender(word, input_string="", lang="en-us"):
    '''
    guess gender of word, optionally use raw input text for context

    Args:
        word (str): the word whose grammatical gender is wanted
        input_string (str): raw utterance the word came from, passed to the
                            language-specific helper as context
        lang (str): the BCP-47 code for the language to use

    Returns:
        "m" if the word is male, "f" if female, False if unknown or if the
        language has no gender rules implemented
    '''
    # Match on the leading language subtag, case-insensitively, like the
    # other dispatchers in this module.  A substring test would treat
    # region subtags as languages (e.g. "de-it" or "ca-es") and would
    # raise TypeError for lang=None.
    lang_lower = str(lang).lower()

    if lang_lower.startswith("pt") or lang_lower.startswith("es"):
        # spanish follows same rules
        return get_gender_pt(word, input_string)
    elif lang_lower.startswith("it"):
        return get_gender_it(word, input_string)

    return False
|