# mycroft-core/mycroft/skills/skill_data.py (686 lines, 22 KiB, Python)

# Copyright 2018 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Handling of skill data such as intents and regular expressions."""
import re
from collections import namedtuple
from os import walk
from pathlib import Path
from typing import Iterator, List, Optional, Tuple

from mycroft.util.file_utils import resolve_resource_file
from mycroft.util.format import expand_options
from mycroft.util.log import LOG
# Immutable record with one slot per kind of skill resource file.  The field
# names are given as a single whitespace-separated string, which namedtuple
# splits into the individual attribute names.
SkillResourceTypes = namedtuple(
    "SkillResourceTypes",
    "dialog entity intent list named_value regex template vocabulary word",
)
class ResourceType:
    """Defines the attributes of a type of skill resource.

    Examples:
        dialog = ResourceType("dialog", ".dialog", "en-us")
        dialog.locate_base_directory(skill_directory)

        named_value = ResourceType("named_value", ".value", "en-us")
        named_value.locate_base_directory(skill_directory)

    Attributes:
        resource_type: one of a predefined set of resource types for skills
        file_extension: the file extension associated with the resource type
        language: language code of the resource files; replaced by "en-us"
            when locate_base_directory() selects a fallback directory
        base_directory: directory containing all files for the resource type,
            None until locate_base_directory() finds one
    """

    # Language used when no directory exists for the requested language.
    _FALLBACK_LANGUAGE = "en-us"

    def __init__(self, resource_type: str, file_extension: str, language: str):
        self.resource_type = resource_type
        self.file_extension = file_extension
        self.language = language
        self.base_directory = None

    def locate_base_directory(self, skill_directory) -> Optional[Path]:
        """Find the skill's base directory for the specified resource type.

        There are three supported methodologies for storing resource files.
        The preferred method is to use the "locale" directory but older methods
        are included in the search for backwards compatibility.  The three
        directory schemes are:
            <skill>/locale/<lang>/.../<resource_type>
            <skill>/<resource_subdirectory>/<lang>/
            <skill>/<resource_subdirectory>

        If the directory for the specified language doesn't exist, fall back to
        the default "en-us".  When a fallback directory is selected,
        self.language is updated to "en-us" as well.

        Args:
            skill_directory: the root directory of a skill

        Returns:
            the skill's directory for the resource type or None if not found
        """
        resource_subdirectory = self._get_resource_subdirectory()
        fallback = self._FALLBACK_LANGUAGE
        # Pair every candidate with the language its files are written in.
        # Deciding the language from the candidate itself (rather than from a
        # substring search of the full path) keeps a skill installed under a
        # path that merely contains "en-us" from losing its language.
        candidates = (
            (Path(skill_directory, "locale", self.language), self.language),
            (Path(skill_directory, "locale", fallback), fallback),
            (
                Path(skill_directory, resource_subdirectory, self.language),
                self.language,
            ),
            (Path(skill_directory, resource_subdirectory, fallback), fallback),
            # Oldest scheme: no language level in the path, keep the language.
            (Path(skill_directory, resource_subdirectory), self.language),
        )
        for directory, language in candidates:
            if directory.exists():
                self.base_directory = directory
                self.language = language
                break

        return self.base_directory

    def _get_resource_subdirectory(self) -> str:
        """Returns the subdirectory for this resource type.

        In the older directory schemes, several resource types were stored
        in the same set of three directories (dialog, regex, vocab).

        Raises:
            KeyError: if self.resource_type is not a known resource type
        """
        subdirectories = dict(
            dialog="dialog",
            entity="vocab",
            intent="vocab",
            list="dialog",
            named_value="dialog",
            regex="regex",
            template="dialog",
            vocab="vocab",
            # Accept the long spelling too so it maps to the same directory.
            vocabulary="vocab",
            word="dialog",
        )

        return subdirectories[self.resource_type]
class ResourceFile:
    """Loads a resource file for the user's configured language.

    Attributes:
        resource_type: attributes of the resource type (dialog, vocab, etc.)
        resource_name: file name of the resource, with or without extension
        file_path: location of the file, or None when it could not be found
    """

    def __init__(self, resource_type, resource_name):
        self.resource_type = resource_type
        self.resource_name = resource_name
        self.file_path = self._locate()

    def _locate(self):
        """Locates a resource file in the skill's locale directory.

        A skill's locale directory can contain a subdirectory structure defined
        by the skill author.  Walk the directory and any subdirectories to
        find the resource file.  When the skill does not provide the file,
        resolve_resource_file() is asked for "text/<language>/<file name>".

        Returns:
            Path of the file inside the skill, otherwise whatever
            resolve_resource_file() returned (presumably None when nothing is
            found - an error is logged in that case).
        """
        file_path = None
        extension = self.resource_type.file_extension
        if self.resource_name.endswith(extension):
            file_name = self.resource_name
        else:
            file_name = self.resource_name + extension

        # No directory was located for this resource type: skip the walk.
        # Converting None to a string would make walk() scan a relative
        # directory literally named "None".
        base_directory = self.resource_type.base_directory
        if base_directory is not None:
            for directory, _, file_names in walk(str(base_directory)):
                # The scan is not stopped on a hit, so when several
                # subdirectories hold the same file name the last one in
                # walk order is used.
                if file_name in file_names:
                    file_path = Path(directory, file_name)

        if file_path is None:
            sub_path = Path("text", self.resource_type.language, file_name)
            file_path = resolve_resource_file(str(sub_path))

        if file_path is None:
            LOG.error(f"Could not find resource file {file_name}")

        return file_path

    def load(self):
        """Override in subclass to define resource type loading behavior."""
        pass

    def _read(self) -> Iterator[str]:
        """Reads the specified file, removing comment and empty lines.

        Yields:
            Each remaining line with surrounding whitespace stripped.
        """
        # Decode explicitly as UTF-8 so translated resources read the same
        # regardless of the locale the process runs under.
        with open(self.file_path, encoding="utf-8") as resource_file:
            for raw_line in resource_file:
                line = raw_line.strip()
                if line and not line.startswith("#"):
                    yield line
class DialogFile(ResourceFile):
    """Defines a dialog file, which is used to instruct TTS what to speak.

    Attributes:
        data: optional mapping of variable names to the values substituted
            into the phrases; None leaves the phrases unformatted
    """

    def __init__(self, resource_type, resource_name):
        super().__init__(resource_type, resource_name)
        self.data = None

    def load(self) -> List[str]:
        """Load the phrases of the file and populate their variables.

        Returns:
            Contents of the file with variables resolved, or None when the
            file could not be located.
        """
        if self.file_path is None:
            return None

        phrases = []
        for record in self._read():
            # Collapse doubled braces to single ones so that str.format()
            # treats "{{name}}" and "{name}" alike as a placeholder.
            phrase = record.replace("{{", "{").replace("}}", "}")
            if self.data is not None:
                phrase = phrase.format(**self.data)
            phrases.append(phrase)

        return phrases

    def render(self, dialog_renderer):
        """Renders a phrase for this dialog using the supplied renderer.

        The resource name and the data are handed to dialog_renderer.render();
        how the phrase is picked is up to the renderer.

        Args:
            dialog_renderer: object providing a render(name, data) method

        Returns:
            whatever dialog_renderer.render() returns (presumably the phrase
            to speak)
        """
        return dialog_renderer.render(self.resource_name, self.data)
class VocabularyFile(ResourceFile):
    """Defines a vocabulary file, which skills use to form intents."""

    def load(self) -> List[List[str]]:
        """Loads a vocabulary file.

        Every record is lower-cased and passed through expand_options(), so
        a record containing sets of words inside parentheses produces one
        vocabulary item per permutation.

        Returns:
            One entry per line of the file; each entry is the list of
            strings expand_options() produced for that line.  Empty when
            the file could not be located.
        """
        if self.file_path is None:
            return []

        return [expand_options(record.lower()) for record in self._read()]
class NamedValueFile(ResourceFile):
    """Defines a named value file, which maps names to values.

    Attributes:
        delimiter: string separating the name from the value in a record
    """

    def __init__(self, resource_type, resource_name):
        super().__init__(resource_type, resource_name)
        self.delimiter = ","

    def load(self) -> dict:
        """Load file containing names and values.

        Records that cannot be split into exactly one name and one value
        are skipped.

        Returns:
            A dictionary representation of the records in the file.
        """
        named_values = {}
        if self.file_path is None:
            return named_values

        for record in self._read():
            name, value = self._load_line(record)
            if name is None or value is None:
                continue
            named_values[name] = value

        return named_values

    def _load_line(self, line: str) -> Tuple[str, str]:
        """Attempts to split the name and value for dictionary loading.

        Args:
            line: a record in a .value file

        Returns:
            The name/value pair that will be loaded into a dictionary, or
            (None, None) when the record does not hold exactly two fields.
        """
        try:
            name, value = line.split(self.delimiter)
        except ValueError:
            # Too few or too many fields; report it and let the caller skip.
            LOG.exception(
                f"Failed to load value file {self.file_path} "
                f"record containing {line}"
            )
            return None, None

        return name, value
class ListFile(DialogFile):
    """Defines a list file.

    Adds nothing of its own; construction and loading behave exactly as
    defined by DialogFile.
    """

    pass
class TemplateFile(DialogFile):
    """Defines a template file.

    Adds nothing of its own; construction and loading behave exactly as
    defined by DialogFile.
    """

    pass
class RegexFile(ResourceFile):
    """Defines a regular expression file, one pattern per line."""

    def load(self):
        """Load the patterns contained in the file.

        Returns:
            List of the non-comment, non-empty lines of the file; empty when
            the file could not be located.
        """
        if not self.file_path:
            return []

        return list(self._read())
class WordFile(ResourceFile):
    """Defines a word file, which defines a word in the configured language."""

    def load(self) -> Optional[str]:
        """Load the word stored in the file.

        Only the first non-comment, non-empty line is used.

        Returns:
            The word contained in the file, or None when the file is missing
            or holds no usable line.
        """
        if self.file_path is None:
            return None

        return next(self._read(), None)
class SkillResources:
    """Loads the resource files a skill provides for a language.

    Attributes:
        skill_directory: root directory of the skill
        language: language code used to locate the resource directories
        types: SkillResourceTypes record holding one ResourceType per kind
            of resource file
        dialog_renderer: object whose render(name, data) method is used by
            render_dialog()
        static: cache of the named value files loaded so far, keyed by the
            name they were requested with
    """

    def __init__(self, skill_directory, language, dialog_renderer):
        self.skill_directory = skill_directory
        self.language = language
        self.types = self._define_resource_types()
        self.dialog_renderer = dialog_renderer
        self.static = dict()

    def _define_resource_types(self) -> SkillResourceTypes:
        """Defines all known types of skill resource files.

        A resource file contains information the skill needs to function.
        Examples include dialog files to be spoken and vocab files for intent
        matching.

        Returns:
            One ResourceType per kind of resource, each with its base
            directory already looked up in the skill directory.
        """
        resource_types = dict(
            dialog=ResourceType("dialog", ".dialog", self.language),
            entity=ResourceType("entity", ".entity", self.language),
            intent=ResourceType("intent", ".intent", self.language),
            list=ResourceType("list", ".list", self.language),
            named_value=ResourceType("named_value", ".value", self.language),
            regex=ResourceType("regex", ".rx", self.language),
            template=ResourceType("template", ".template", self.language),
            vocabulary=ResourceType("vocab", ".voc", self.language),
            word=ResourceType("word", ".word", self.language),
        )
        for resource_type in resource_types.values():
            resource_type.locate_base_directory(self.skill_directory)

        return SkillResourceTypes(**resource_types)

    def load_dialog_file(self, name, data=None) -> List[str]:
        """Loads the contents of a dialog file into memory.

        Named variables in the dialog are populated with values found in the
        data dictionary.

        Args:
            name: name of the dialog file (no extension needed)
            data: keyword arguments used to populate variables

        Returns:
            A list of phrases with variables resolved
        """
        dialog_file = DialogFile(self.types.dialog, name)
        dialog_file.data = data

        return dialog_file.load()

    def load_list_file(self, name, data=None) -> List[str]:
        """Load a file containing a list of words or phrases.

        Named variables in the list are populated with values found in the
        data dictionary.

        Args:
            name: name of the list file (no extension needed)
            data: keyword arguments used to populate variables

        Returns:
            List of words or phrases read from the list file.
        """
        list_file = ListFile(self.types.list, name)
        list_file.data = data

        return list_file.load()

    def load_named_value_file(self, name, delimiter=None) -> dict:
        """Load file containing a set of names and values.

        Loads a simple delimited file of name/value pairs.
        The name is the first item, the value is the second.

        The result is cached in self.static under the requested name; a
        later call with the same name returns the cached dictionary, whatever
        delimiter is passed.

        Args:
            name: name of the .value file, no extension needed
            delimiter: delimiter character used

        Returns:
            File contents represented as a dictionary
        """
        if name in self.static:
            return self.static[name]

        named_value_file = NamedValueFile(self.types.named_value, name)
        if delimiter is not None:
            named_value_file.delimiter = delimiter
        named_values = named_value_file.load()
        self.static[name] = named_values

        return named_values

    def load_regex_file(self, name) -> List[str]:
        """Loads a file containing regular expression patterns.

        The regular expression patterns are generally used to find a value
        in a user utterance the skill needs to properly perform the requested
        function.

        Args:
            name: name of the regular expression file, no extension needed

        Returns:
            List representation of the regular expression file.
        """
        regex_file = RegexFile(self.types.regex, name)

        return regex_file.load()

    def load_template_file(self, name, data=None) -> List[str]:
        """Loads the contents of a template file into memory.

        Named variables in the template are populated with values found in
        the data dictionary.

        Args:
            name: name of the template file (no extension needed)
            data: keyword arguments used to populate variables

        Returns:
            A list of phrases with variables resolved
        """
        template_file = TemplateFile(self.types.template, name)
        template_file.data = data

        return template_file.load()

    def load_vocabulary_file(self, name) -> List[List[str]]:
        """Loads a file containing variations of words meaning the same thing.

        A vocabulary file defines words a skill uses for intent matching.
        It can also be used to match words in an utterance after intent
        matching is complete.

        Args:
            name: name of the vocabulary file, no extension needed

        Returns:
            List representation of the vocabulary file.
        """
        vocabulary_file = VocabularyFile(self.types.vocabulary, name)

        return vocabulary_file.load()

    def load_word_file(self, name) -> Optional[str]:
        """Loads a file containing a word.

        Args:
            name: name of the word file, no extension needed

        Returns:
            The word contained in the file, None if there is none.
        """
        word_file = WordFile(self.types.word, name)

        return word_file.load()

    def render_dialog(self, name, data=None) -> str:
        """Renders a dialog for TTS purposes using the dialog renderer.

        Args:
            name: name of the dialog file (no extension needed)
            data: keyword arguments used to populate variables

        Returns:
            The result of dialog_renderer.render() for the dialog.
        """
        resource_file = DialogFile(self.types.dialog, name)
        resource_file.data = data

        return resource_file.render(self.dialog_renderer)

    def load_skill_vocabulary(self, alphanumeric_skill_id: str) -> dict:
        """Loads every vocabulary file found for the skill.

        Args:
            alphanumeric_skill_id: skill identifier used as key prefix

        Returns:
            Dictionary keyed by the skill id followed by the title-cased file
            name (without ".voc"); the values are the loaded vocabularies.
            Files without any vocabulary are left out.  Empty when the skill
            has no vocabulary directory.
        """
        skill_vocabulary = {}
        base_directory = self.types.vocabulary.base_directory
        if base_directory is None:
            # os.walk() rejects None; a skill without vocabulary is valid.
            return skill_vocabulary

        for _, _, files in walk(base_directory):
            for file_name in files:
                if not file_name.endswith(".voc"):
                    continue
                vocab_type = alphanumeric_skill_id + file_name[:-4].title()
                vocabulary = self.load_vocabulary_file(file_name)
                if vocabulary:
                    skill_vocabulary[vocab_type] = vocabulary

        return skill_vocabulary

    def load_skill_regex(self, alphanumeric_skill_id: str) -> List[str]:
        """Loads every regular expression file found for the skill.

        Args:
            alphanumeric_skill_id: skill identifier added to the group names

        Returns:
            All patterns of all ".rx" files with uniquely named groups.
            Empty when the skill has no regular expression directory.

        Raises:
            re.error if one of the patterns does not compile
        """
        skill_regexes = []
        base_directory = self.types.regex.base_directory
        if base_directory is None:
            # os.walk() rejects None; a skill without regexes is valid.
            return skill_regexes

        for _, _, files in walk(base_directory):
            for file_name in files:
                if file_name.endswith(".rx"):
                    skill_regexes.extend(self.load_regex_file(file_name))

        # Prefix the groups once, after every file has been collected, so no
        # pattern gets the skill id inserted twice.
        return self._make_unique_regex_group(
            skill_regexes, alphanumeric_skill_id
        )

    @staticmethod
    def _make_unique_regex_group(
        regexes: List[str], alphanumeric_skill_id: str
    ) -> List[str]:
        """Adds skill ID to group ID in a regular expression for uniqueness.

        Args:
            regexes: regular expression strings
            alphanumeric_skill_id: skill identifier

        Returns:
            regular expressions with uniquely named group IDs

        Raises:
            re.error if the regex does not compile
        """
        group_start = "(?P<"
        prefixed_group_start = group_start + alphanumeric_skill_id
        modified_regexes = []
        for regex in regexes:
            modified_regex = regex.replace(group_start, prefixed_group_start)
            # Compile only to validate; the string form is what is returned.
            re.compile(modified_regex)
            modified_regexes.append(modified_regex)

        return modified_regexes
class RegexExtractor:
    """Extracts data from an utterance using regular expressions.

    Attributes:
        group_name: name of the regular expression group holding the value
        regex_patterns: regular expressions read from a .rx file
    """

    def __init__(self, group_name, regex_patterns):
        self.group_name = group_name
        self.regex_patterns = regex_patterns

    def extract(self, utterance) -> Optional[str]:
        """Attempt to find a value in a user request.

        Args:
            utterance: request spoken by the user

        Returns:
            The value extracted from the utterance, if found
        """
        extract = None
        pattern_match = self._match_utterance_to_patterns(utterance)
        if pattern_match is not None:
            extract = self._extract_group_from_match(pattern_match)
        self._log_extraction_result(extract)

        return extract

    def _match_utterance_to_patterns(self, utterance: str):
        """Match regular expressions to user request.

        The patterns are tried in order; the first one that matches wins.

        Args:
            utterance: request spoken by the user

        Returns:
            a regular expression match object if a match is found, else None
        """
        for pattern in self.regex_patterns:
            pattern_match = re.search(pattern, utterance)
            if pattern_match:
                return pattern_match

        return None

    def _extract_group_from_match(self, pattern_match) -> Optional[str]:
        """Extract the value of the configured group from a match.

        Args:
            pattern_match: a regular expression match object

        Returns:
            The group's text with surrounding whitespace removed, or None
            when the pattern has no such group, the group took no part in
            the match, or the text is empty after stripping.
        """
        try:
            extract = pattern_match.group(self.group_name)
        except IndexError:
            # The matching pattern does not define the group at all.
            return None

        if extract is None:
            # The group exists but sits in a branch that did not match,
            # e.g. "(?P<Name>a)|b" matched against "b".
            return None

        return extract.strip() or None

    def _log_extraction_result(self, extract: Optional[str]):
        """Log the results of the matching.

        Args:
            extract: the value extracted from the user utterance
        """
        if extract is None:
            LOG.info(f"No {self.group_name.lower()} extracted from utterance")
        else:
            LOG.info(f"{self.group_name} extracted from utterance: " + extract)
def to_alnum(skill_id):
    """Convert a skill id to only alphanumeric characters and underscores.

    The id is converted to a string first; every character that is not
    alphanumeric is then replaced by "_".

    Args:
        skill_id (str): identifier to be converted

    Returns:
        (str) the identifier with non-alphanumeric characters replaced
    """
    sanitized = [
        character if character.isalnum() else "_"
        for character in str(skill_id)
    ]

    return "".join(sanitized)
def munge_regex(regex, skill_id):
    """Insert the alphanumeric skill id into named match groups.

    Every "(?P<" in the expression is followed by the result of
    to_alnum(skill_id), so the group names start with the skill id.

    Args:
        regex (str): regex string
        skill_id (str): skill identifier

    Returns:
        (str) munged regex
    """
    group_start = "(?P<"
    prefixed_group_start = group_start + to_alnum(skill_id)

    return regex.replace(group_start, prefixed_group_start)
def _munge_keyword(keyword, prefix):
    """Return keyword starting with prefix, adding it only when missing."""
    return keyword if keyword.startswith(prefix) else prefix + keyword


def munge_intent_parser(intent_parser, name, skill_id):
    """Rename intent keywords to make them skill exclusive.

    This gives the intent parser an exclusive name in the
    format <skill_id>:<name>.  The keywords are given unique
    names in the format <Skill id as letters><Intent name>.

    The function will not munge instances that have already been
    munged: a name or keyword that already starts with the prefix is
    left untouched.

    Args:
        intent_parser: (IntentParser) object to update; its name, requires,
            optional, at_least_one and excludes attributes are replaced
        name: (str) name to give the intent parser
        skill_id: skill identifier, converted with str()
    """
    # Munge parser name
    parser_prefix = str(skill_id) + ":"
    if name.startswith(parser_prefix):
        intent_parser.name = name
    else:
        intent_parser.name = parser_prefix + name

    # Munge keywords
    skill_id = to_alnum(skill_id)

    def munge_pair(pair):
        # Pairs are decided on their first item; a munged pair carries the
        # prefixed keyword in both positions.
        if pair[0].startswith(skill_id):
            return pair
        munged = skill_id + pair[0]
        return (munged, munged)

    # Munge required keywords
    intent_parser.requires = [munge_pair(i) for i in intent_parser.requires]

    # Munge optional keywords
    intent_parser.optional = [munge_pair(i) for i in intent_parser.optional]

    # Munge at_least_one keywords.  Only a leading skill id counts as
    # "already munged"; removing the id from anywhere in the keyword would
    # corrupt keywords that merely contain it.
    intent_parser.at_least_one = [
        tuple(_munge_keyword(keyword, skill_id) for keyword in group)
        for group in intent_parser.at_least_one
    ]

    # Munge excluded keywords
    intent_parser.excludes = [
        _munge_keyword(keyword, skill_id) for keyword in intent_parser.excludes
    ]