mycroft-core/mycroft/util/parse.py

218 lines
7.8 KiB
Python
Raw Normal View History

# -*- coding: utf-8 -*-
#
Change to Apache 2.0 license from GPLv3.0 This commit officially switches the mycroft-core repository from GPLv3.0 licensing to Apache 2.0. All dependencies on GPL'ed code have been removed and we have contacted all previous contributors with still-existing code in the repository to agree to this change. Going forward, all contributors will sign a Contributor License Agreement (CLA) by visiting https://mycroft.ai/cla, then they will be included in the Mycroft Project's overall Contributor list, found at: https://github.com/MycroftAI/contributors. This cleanly protects the project, the contributor and all who use the technology to build upon. Futher discussion can be found at this blog post: https://mycroft.ai/blog/right-license/ This commit also removes all __author__="" from the code. These lines are painful to maintain and the etiquette surrounding their maintainence is unclear. Do you remove a name from the list if the last line of code the wrote gets replaced? Etc. Now all contributors are publicly acknowledged in the aforementioned repo, and actual authorship is maintained by Github in a much more effective and elegant way! Finally, a few references to "Mycroft AI" were changed to the correct legal entity name "Mycroft AI Inc." ==== Fixed Issues ==== #403 Update License.md and file headers to Apache 2.0 #400 Update LICENSE.md ==== Documentation Notes ==== Deprecated the ScheduledSkill and ScheduledCRUDSkill classes. These capabilities have been superceded by the more flexible MycroftSkill class methods schedule_event(), schedule_repeating_event(), update_event(), and cancel_event().
2017-10-04 06:28:44 +00:00
# Copyright 2017 Mycroft AI Inc.
#
Change to Apache 2.0 license from GPLv3.0 This commit officially switches the mycroft-core repository from GPLv3.0 licensing to Apache 2.0. All dependencies on GPL'ed code have been removed and we have contacted all previous contributors with still-existing code in the repository to agree to this change. Going forward, all contributors will sign a Contributor License Agreement (CLA) by visiting https://mycroft.ai/cla, then they will be included in the Mycroft Project's overall Contributor list, found at: https://github.com/MycroftAI/contributors. This cleanly protects the project, the contributor and all who use the technology to build upon. Futher discussion can be found at this blog post: https://mycroft.ai/blog/right-license/ This commit also removes all __author__="" from the code. These lines are painful to maintain and the etiquette surrounding their maintainence is unclear. Do you remove a name from the list if the last line of code the wrote gets replaced? Etc. Now all contributors are publicly acknowledged in the aforementioned repo, and actual authorship is maintained by Github in a much more effective and elegant way! Finally, a few references to "Mycroft AI" were changed to the correct legal entity name "Mycroft AI Inc." ==== Fixed Issues ==== #403 Update License.md and file headers to Apache 2.0 #400 Update LICENSE.md ==== Documentation Notes ==== Deprecated the ScheduledSkill and ScheduledCRUDSkill classes. These capabilities have been superceded by the more flexible MycroftSkill class methods schedule_event(), schedule_repeating_event(), update_event(), and cancel_event().
2017-10-04 06:28:44 +00:00
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
Change to Apache 2.0 license from GPLv3.0 This commit officially switches the mycroft-core repository from GPLv3.0 licensing to Apache 2.0. All dependencies on GPL'ed code have been removed and we have contacted all previous contributors with still-existing code in the repository to agree to this change. Going forward, all contributors will sign a Contributor License Agreement (CLA) by visiting https://mycroft.ai/cla, then they will be included in the Mycroft Project's overall Contributor list, found at: https://github.com/MycroftAI/contributors. This cleanly protects the project, the contributor and all who use the technology to build upon. Futher discussion can be found at this blog post: https://mycroft.ai/blog/right-license/ This commit also removes all __author__="" from the code. These lines are painful to maintain and the etiquette surrounding their maintainence is unclear. Do you remove a name from the list if the last line of code the wrote gets replaced? Etc. Now all contributors are publicly acknowledged in the aforementioned repo, and actual authorship is maintained by Github in a much more effective and elegant way! Finally, a few references to "Mycroft AI" were changed to the correct legal entity name "Mycroft AI Inc." ==== Fixed Issues ==== #403 Update License.md and file headers to Apache 2.0 #400 Update LICENSE.md ==== Documentation Notes ==== Deprecated the ScheduledSkill and ScheduledCRUDSkill classes. These capabilities have been superceded by the more flexible MycroftSkill class methods schedule_event(), schedule_repeating_event(), update_event(), and cancel_event().
2017-10-04 06:28:44 +00:00
# http://www.apache.org/licenses/LICENSE-2.0
#
Change to Apache 2.0 license from GPLv3.0 This commit officially switches the mycroft-core repository from GPLv3.0 licensing to Apache 2.0. All dependencies on GPL'ed code have been removed and we have contacted all previous contributors with still-existing code in the repository to agree to this change. Going forward, all contributors will sign a Contributor License Agreement (CLA) by visiting https://mycroft.ai/cla, then they will be included in the Mycroft Project's overall Contributor list, found at: https://github.com/MycroftAI/contributors. This cleanly protects the project, the contributor and all who use the technology to build upon. Futher discussion can be found at this blog post: https://mycroft.ai/blog/right-license/ This commit also removes all __author__="" from the code. These lines are painful to maintain and the etiquette surrounding their maintainence is unclear. Do you remove a name from the list if the last line of code the wrote gets replaced? Etc. Now all contributors are publicly acknowledged in the aforementioned repo, and actual authorship is maintained by Github in a much more effective and elegant way! Finally, a few references to "Mycroft AI" were changed to the correct legal entity name "Mycroft AI Inc." ==== Fixed Issues ==== #403 Update License.md and file headers to Apache 2.0 #400 Update LICENSE.md ==== Documentation Notes ==== Deprecated the ScheduledSkill and ScheduledCRUDSkill classes. These capabilities have been superceded by the more flexible MycroftSkill class methods schedule_event(), schedule_repeating_event(), update_event(), and cancel_event().
2017-10-04 06:28:44 +00:00
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from difflib import SequenceMatcher
from mycroft.util.time import now_local
from mycroft.util.lang.parse_en import *
from mycroft.util.lang.parse_pt import *
from mycroft.util.lang.parse_es import *
from mycroft.util.lang.parse_it import *
from mycroft.util.lang.parse_sv import *
from mycroft.util.lang.parse_de import extractnumber_de
from mycroft.util.lang.parse_de import extract_datetime_de
from mycroft.util.lang.parse_de import normalize_de
from mycroft.util.lang.parse_fr import extractnumber_fr
from mycroft.util.lang.parse_fr import extract_datetime_fr
from mycroft.util.lang.parse_fr import normalize_fr
from mycroft.util.lang.parse_common import *
def fuzzy_match(x, against):
    """Perform a 'fuzzy' comparison between two strings.

    Args:
        x (str): the string to compare
        against (str): the string to compare it with

    Returns:
        float: match percentage -- 1.0 for perfect match,
               down to 0.0 for no match at all.
    """
    return SequenceMatcher(None, x, against).ratio()
def match_one(query, choices):
    """Find the best match for a query among a list or dictionary of choices.

    Args:
        query (str): string to test
        choices (list or dict): the candidates. For a dict, the keys are
            the strings compared against the query.

    Returns:
        tuple: (best match, score). For a list the best match is the
            best-scoring element; for a dict it is the value stored under
            the best-scoring key.

    Raises:
        ValueError: if choices is neither a list nor a dict.
        IndexError: if choices is empty.
    """
    if isinstance(choices, dict):
        candidates = list(choices.keys())
    elif isinstance(choices, list):
        candidates = choices
    else:
        raise ValueError('a list or dict of choices must be provided')

    best_choice = candidates[0]
    best_score = fuzzy_match(query, best_choice)
    for candidate in candidates[1:]:
        score = fuzzy_match(query, candidate)
        # Strict comparison: on a tie the earliest candidate wins.
        if score > best_score:
            best_choice, best_score = candidate, score

    if isinstance(choices, dict):
        # Callers of the dict form want the mapped value, not the key.
        return (choices[best_choice], best_score)
    return (best_choice, best_score)
def extract_number(text, short_scale=True, ordinals=False, lang="en-us"):
    """Takes in a string and extracts a number.

    Dispatches to the language-specific extractor selected by the prefix
    of the (lower-cased) language code.

    Args:
        text (str): the string to extract a number from
        short_scale (bool): Use "short scale" or "long scale" for large
            numbers -- over a million.  The default is short scale, which
            is now common in most English speaking countries.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
            Only forwarded to the English extractor.
        ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
            Only forwarded to the English extractor.
        lang (str): the BCP-47 code for the language to use

    Returns:
        (int, float or False): The number extracted or False if the input
                               text contains no numbers.
        If no extractor exists for the requested language, the input
        text is returned unchanged.
    """
    lang_lower = str(lang).lower()
    if lang_lower.startswith("en"):
        return extractnumber_en(text, short_scale=short_scale,
                                ordinals=ordinals)
    elif lang_lower.startswith("pt"):
        return extractnumber_pt(text)
    elif lang_lower.startswith("it"):
        return extractnumber_it(text)
    elif lang_lower.startswith("fr"):
        return extractnumber_fr(text)
    elif lang_lower.startswith("sv"):
        return extractnumber_sv(text)
    elif lang_lower.startswith("de"):
        return extractnumber_de(text)

    # TODO: extractnumber_xx for other languages
    return text
def extract_datetime(text, anchorDate=None, lang="en-us", default_time=None):
    """
    Extracts date and time information from a sentence.  Parses many of the
    common ways that humans express dates and times, including relative dates
    like "5 days from today", "tomorrow", and "Tuesday".

    Vague terminology is given arbitrary values, like:
        - morning = 8 AM
        - afternoon = 3 PM
        - evening = 7 PM

    If a time isn't supplied or implied, the function defaults to 12 AM

    Args:
        text (str): the text to be interpreted
        anchorDate (:obj:`datetime`, optional): the date to be used for
            relative dating (for example, what does "tomorrow" mean?).
            Defaults to the current local date/time.
        lang (string): the BCP-47 code for the language to use
        default_time (datetime.time): time to use if none was found in
            the input string.

    Returns:
        [:obj:`datetime`, :obj:`str`]: 'datetime' is the extracted date
            as a datetime object in the user's local timezone.
            'leftover_string' is the original phrase with all date and time
            related keywords stripped out. See examples for further
            clarification

            Returns 'None' if the input string is empty.

            If no extractor exists for the requested language, the input
            text is returned unchanged.

    Examples:

        >>> extract_datetime(
        ... "What is the weather like the day after tomorrow?",
        ... datetime(2017, 6, 30, 0, 0)
        ... )
        [datetime.datetime(2017, 7, 2, 0, 0), 'what is weather like']

        >>> extract_datetime(
        ... "Set up an appointment 2 weeks from Sunday at 5 pm",
        ... datetime(2016, 2, 19, 0, 0)
        ... )
        [datetime.datetime(2016, 3, 6, 17, 0), 'set up appointment']
    """
    lang_lower = str(lang).lower()

    # Relative expressions need a reference point; fall back to "now".
    if not anchorDate:
        anchorDate = now_local()

    if lang_lower.startswith("en"):
        return extract_datetime_en(text, anchorDate, default_time)
    elif lang_lower.startswith("pt"):
        return extract_datetime_pt(text, anchorDate, default_time)
    elif lang_lower.startswith("it"):
        return extract_datetime_it(text, anchorDate, default_time)
    elif lang_lower.startswith("fr"):
        return extract_datetime_fr(text, anchorDate, default_time)
    elif lang_lower.startswith("sv"):
        return extract_datetime_sv(text, anchorDate, default_time)
    elif lang_lower.startswith("de"):
        return extract_datetime_de(text, anchorDate, default_time)

    # TODO: extract_datetime for other languages
    return text
2017-05-25 23:31:43 +00:00
# ==============================================================
def normalize(text, lang="en-us", remove_articles=True):
    """Prepare a string for parsing

    This function prepares the given text for parsing by making
    numbers consistent, getting rid of contractions, etc.
    The work is delegated to the language-specific normalizer selected
    by the prefix of the (lower-cased) language code.

    Args:
        text (str): the string to normalize
        lang (str): the code for the language text is in
        remove_articles (bool): whether to remove articles (like 'a', or
                                'the'). True by default.

    Returns:
        (str): The normalized string. If no normalizer exists for the
               requested language, the input text is returned unchanged.
    """
    lang_lower = str(lang).lower()
    if lang_lower.startswith("en"):
        return normalize_en(text, remove_articles)
    elif lang_lower.startswith("es"):
        return normalize_es(text, remove_articles)
    elif lang_lower.startswith("pt"):
        return normalize_pt(text, remove_articles)
    elif lang_lower.startswith("it"):
        return normalize_it(text, remove_articles)
    elif lang_lower.startswith("fr"):
        return normalize_fr(text, remove_articles)
    elif lang_lower.startswith("sv"):
        return normalize_sv(text, remove_articles)
    elif lang_lower.startswith("de"):
        return normalize_de(text, remove_articles)

    # TODO: Normalization for other languages
    return text
def get_gender(word, input_string="", lang="en-us"):
    """Guess the gender of a word, optionally using the raw input for context.

    The language is selected by the prefix of the lower-cased language
    code, the same way the other dispatchers in this module do it, so
    that e.g. "PT-BR" is recognized and "en-it" is not treated as Italian.

    Args:
        word (str): the word whose gender should be guessed
        input_string (str): the raw input text, passed through to the
            language-specific implementation as context
        lang (str): the BCP-47 code for the language to use

    Returns:
        "m" if the word is male, "f" if female, False if unknown
        (including when the language is not supported).
    """
    lang_lower = str(lang).lower()
    if lang_lower.startswith(("pt", "es")):
        # spanish follows same rules
        return get_gender_pt(word, input_string)
    elif lang_lower.startswith("it"):
        return get_gender_it(word, input_string)
    return False