# Copyright 2018 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Handling of skill data such as intents and regular expressions."""
import re
from collections import namedtuple
from os import walk
from pathlib import Path
from typing import Iterator, List, Optional, Tuple

from mycroft.util.file_utils import resolve_resource_file
from mycroft.util.format import expand_options
from mycroft.util.log import LOG

SkillResourceTypes = namedtuple(
    "SkillResourceTypes",
    [
        "dialog",
        "entity",
        "intent",
        "list",
        "named_value",
        "regex",
        "template",
        "vocabulary",
        "word",
    ],
)


class ResourceType:
    """Defines the attributes of a type of skill resource.

    Examples:
        dialog = ResourceType("dialog", ".dialog", "en-us")
        dialog.locate_base_directory(skill_directory)

        named_value = ResourceType("named_value", ".value", "en-us")
        named_value.locate_base_directory(skill_directory)

    Attributes:
        resource_type: one of a predefined set of resource types for skills
        file_extension: the file extension associated with the resource type
        language: language the resources are requested in; reset to "en-us"
            when the "en-us" fallback directory is selected
        base_directory: directory containing all files for the resource type,
            None until locate_base_directory() finds one
    """

    def __init__(self, resource_type: str, file_extension: str, language: str):
        self.resource_type = resource_type
        self.file_extension = file_extension
        self.language = language
        self.base_directory = None

    def locate_base_directory(self, skill_directory):
        """Find the skill's base directory for the specified resource type.

        There are three supported methodologies for storing resource files.
        The preferred method is to use the "locale" directory but older methods
        are included in the search for backwards compatibility.  The three
        directory schemes are:
           <skill_directory>/locale/<language>/
           <skill_directory>/<resource_subdirectory>/<language>/
           <skill_directory>/<resource_subdirectory>/
        If the directory for the specified language doesn't exist, fall back to
        the default "en-us".

        The first existing candidate is stored in ``self.base_directory``.

        Args:
            skill_directory: the root directory of a skill

        Returns:
            the skill's directory for the resource type or None if not found
        """
        resource_subdirectory = self._get_resource_subdirectory()
        possible_directories = (
            Path(skill_directory, "locale", self.language),
            Path(skill_directory, "locale", "en-us"),
            Path(skill_directory, resource_subdirectory, self.language),
            Path(skill_directory, resource_subdirectory, "en-us"),
            Path(skill_directory, resource_subdirectory),
        )
        for directory in possible_directories:
            if directory.exists():
                self.base_directory = directory
                # Only the final path component identifies the fallback.  A
                # substring test on the whole path would misfire whenever the
                # skill itself lives under a directory containing "en-us".
                if directory.name == "en-us" and self.language != "en-us":
                    self.language = "en-us"
                break

        return self.base_directory

    def _get_resource_subdirectory(self) -> str:
        """Returns the subdirectory for this resource type.

        In the older directory schemes, several resource types were stored
        in the same set of three directories (dialog, regex, vocab).

        Raises:
            KeyError: if self.resource_type is not a known resource type
        """
        subdirectories = {
            "dialog": "dialog",
            "entity": "vocab",
            "intent": "vocab",
            "list": "dialog",
            "named_value": "dialog",
            "regex": "regex",
            "template": "dialog",
            "vocab": "vocab",
            "word": "dialog",
        }

        return subdirectories[self.resource_type]


class ResourceFile:
    """Loads a resource file for the user's configured language.

    Attributes:
        resource_type: attributes of the resource type (dialog, vocab, etc.)
        resource_name: file name of the resource, with or without extension
        file_path: path to the file, or None if it could not be located
    """

    def __init__(self, resource_type, resource_name):
        self.resource_type = resource_type
        self.resource_name = resource_name
        self.file_path = self._locate()

    def _locate(self):
        """Locates a resource file in the skill's locale directory.

        A skill's locale directory can contain a subdirectory structure defined
        by the skill author.  Walk the directory and any subdirectories to
        find the resource file.  If the skill does not provide the file,
        resolve_resource_file() is asked for "text/<language>/<file name>".

        Returns:
            Path to the resource file, or None (after logging an error) when
            the file could not be found.
        """
        extension = self.resource_type.file_extension
        if self.resource_name.endswith(extension):
            file_name = self.resource_name
        else:
            file_name = self.resource_name + extension

        file_path = None
        base_directory = self.resource_type.base_directory
        # Without this guard str(None) would make walk() search a relative
        # directory literally named "None".
        if base_directory is not None:
            for directory, _, file_names in walk(str(base_directory)):
                # No early exit: when several files share the name, the one
                # visited last by walk() wins (kept for compatibility).
                if file_name in file_names:
                    file_path = Path(directory, file_name)

        if file_path is None:
            sub_path = Path("text", self.resource_type.language, file_name)
            file_path = resolve_resource_file(str(sub_path))

        if file_path is None:
            LOG.error(f"Could not find resource file {file_name}")

        return file_path

    def load(self):
        """Override in subclass to define resource type loading behavior."""
        pass

    def _read(self) -> Iterator[str]:
        """Reads the specified file, removing comment and empty lines.

        Yields:
            Each remaining line with leading/trailing whitespace stripped.
        """
        # Explicit encoding so the result does not depend on the platform
        # default.
        with open(self.file_path, encoding="utf-8") as resource_file:
            for raw_line in resource_file:
                line = raw_line.strip()
                if not line or line.startswith("#"):
                    continue
                yield line


class DialogFile(ResourceFile):
    """Defines a dialog file, which is used to instruct TTS what to speak.

    Attributes:
        data: optional mapping used to fill named variables in each line
    """

    def __init__(self, resource_type, resource_name):
        super().__init__(resource_type, resource_name)
        self.data = None

    def load(self) -> Optional[List[str]]:
        """Load the lines from a file and populate the variables.

        Double braces are collapsed to single braces before str.format() is
        applied with ``self.data`` (formatting is skipped when data is None).

        Returns:
            Contents of the file with variables resolved, or None when the
            file was not found.
        """
        dialogs = None
        if self.file_path is not None:
            dialogs = []
            for line in self._read():
                line = line.replace("{{", "{").replace("}}", "}")
                if self.data is not None:
                    line = line.format(**self.data)
                dialogs.append(line)

        return dialogs

    def render(self, dialog_renderer):
        """Renders a phrase from a dialog file.

        Delegates to ``dialog_renderer.render`` with the resource name and
        the data assigned to this file.

        Args:
            dialog_renderer: object exposing ``render(name, data)``

        Returns:
            whatever the renderer returns for this dialog
        """
        return dialog_renderer.render(self.resource_name, self.data)


class VocabularyFile(ResourceFile):
    """Defines a vocabulary file, which skills use to form intents."""

    def load(self) -> List[List[str]]:
        """Loads a vocabulary file.

        Each record is lower-cased and passed to expand_options(); the
        results are collected in file order.

        Returns:
            List of lines in the file.  Each item in the list is the value
            expand_options() produced for that line.  Empty when the file
            was not found.
        """
        vocabulary = []
        if self.file_path is not None:
            for line in self._read():
                vocabulary.append(expand_options(line.lower()))

        return vocabulary


class NamedValueFile(ResourceFile):
    """Defines a named value file, which maps a variable to a value.

    Attributes:
        delimiter: separator between the name and the value (default ",")
    """

    def __init__(self, resource_type, resource_name):
        super().__init__(resource_type, resource_name)
        self.delimiter = ","

    def load(self) -> dict:
        """Load file containing names and values.

        Records that cannot be split into exactly a name and a value are
        logged and skipped.

        Returns:
            A dictionary representation of the records in the file.
        """
        named_values = {}
        if self.file_path is not None:
            for line in self._read():
                name, value = self._load_line(line)
                if name is not None and value is not None:
                    named_values[name] = value

        return named_values

    def _load_line(self, line: str) -> Tuple[Optional[str], Optional[str]]:
        """Attempts to split the name and value for dictionary loading.

        Args:
            line: a record in a .value file

        Returns:
            The name/value pair that will be loaded into a dictionary, or
            (None, None) when the record does not contain exactly two fields.
        """
        name = None
        value = None
        try:
            name, value = line.split(self.delimiter)
        except ValueError:
            LOG.exception(
                f"Failed to load value file {self.file_path} "
                f"record containing {line}"
            )

        return name, value


class ListFile(DialogFile):
    """Defines a list file; loaded exactly like a dialog file."""

    pass


class TemplateFile(DialogFile):
    """Defines a template file; loaded exactly like a dialog file."""

    pass


class RegexFile(ResourceFile):
    """Defines a regular expression file containing one pattern per line."""

    def load(self):
        """Load the patterns in the file.

        Returns:
            List of non-comment, non-empty lines; empty if no file was found.
        """
        regex_patterns = []
        if self.file_path:
            regex_patterns = list(self._read())

        return regex_patterns


class WordFile(ResourceFile):
    """Defines a word file, which defines a word in the configured language."""

    def load(self) -> Optional[str]:
        """Load the first usable line of the file.

        Returns:
            The word contained in the file, or None when the file was not
            found or holds no usable line.
        """
        word = None
        if self.file_path is not None:
            # Only the first non-comment, non-empty line is of interest.
            word = next(self._read(), None)

        return word


class SkillResources:
    """Provides access to all the resource files of a single skill.

    Attributes:
        skill_directory: the root directory of the skill
        language: language the resources are requested in
        types: SkillResourceTypes tuple with one ResourceType per kind
        dialog_renderer: renderer passed to DialogFile.render()
        static: cache of named value files already loaded, keyed by name
    """

    def __init__(self, skill_directory, language, dialog_renderer):
        self.skill_directory = skill_directory
        self.language = language
        self.types = self._define_resource_types()
        self.dialog_renderer = dialog_renderer
        self.static = dict()

    def _define_resource_types(self) -> SkillResourceTypes:
        """Defines all known types of skill resource files.

        A resource file contains information the skill needs to function.
        Examples include dialog files to be spoken and vocab files for intent
        matching.  The base directory of every type is located immediately.
        """
        resource_types = dict(
            dialog=ResourceType("dialog", ".dialog", self.language),
            entity=ResourceType("entity", ".entity", self.language),
            intent=ResourceType("intent", ".intent", self.language),
            list=ResourceType("list", ".list", self.language),
            named_value=ResourceType("named_value", ".value", self.language),
            regex=ResourceType("regex", ".rx", self.language),
            template=ResourceType("template", ".template", self.language),
            vocabulary=ResourceType("vocab", ".voc", self.language),
            word=ResourceType("word", ".word", self.language),
        )
        for resource_type in resource_types.values():
            resource_type.locate_base_directory(self.skill_directory)

        return SkillResourceTypes(**resource_types)

    def load_dialog_file(self, name, data=None) -> Optional[List[str]]:
        """Loads the contents of a dialog file into memory.

        Named variables in the dialog are populated with values found in the
        data dictionary.

        Args:
            name: name of the dialog file (no extension needed)
            data: keyword arguments used to populate variables

        Returns:
            A list of phrases with variables resolved, or None if the file
            was not found
        """
        dialog_file = DialogFile(self.types.dialog, name)
        dialog_file.data = data

        return dialog_file.load()

    def load_list_file(self, name, data=None) -> Optional[List[str]]:
        """Load a file containing a list of words or phrases

        Named variables in the list are populated with values found in the
        data dictionary.

        Args:
            name: name of the list file (no extension needed)
            data: keyword arguments used to populate variables

        Returns:
            List of words or phrases read from the list file, or None if the
            file was not found
        """
        list_file = ListFile(self.types.list, name)
        list_file.data = data

        return list_file.load()

    def load_named_value_file(self, name, delimiter=None) -> dict:
        """Load file containing a set of names and values.

        Loads a simple delimited file of name/value pairs.  The name is the
        first item, the value is the second.  Results are cached in
        ``self.static`` by file name; a cached result is returned regardless
        of the delimiter passed on later calls.

        Args:
            name: name of the .value file, no extension needed
            delimiter: delimiter character used

        Returns:
            File contents represented as a dictionary
        """
        if name in self.static:
            named_values = self.static[name]
        else:
            named_value_file = NamedValueFile(self.types.named_value, name)
            if delimiter is not None:
                named_value_file.delimiter = delimiter
            named_values = named_value_file.load()
            self.static[name] = named_values

        return named_values

    def load_regex_file(self, name) -> List[str]:
        """Loads a file containing regular expression patterns.

        The regular expression patterns are generally used to find a value
        in a user utterance the skill needs to properly perform the requested
        function.

        Args:
            name: name of the regular expression file, no extension needed

        Returns:
            List representation of the regular expression file.
        """
        regex_file = RegexFile(self.types.regex, name)

        return regex_file.load()

    def load_template_file(self, name, data=None) -> Optional[List[str]]:
        """Loads the contents of a template file into memory.

        Named variables in the template are populated with values found in
        the data dictionary.

        Args:
            name: name of the template file (no extension needed)
            data: keyword arguments used to populate variables

        Returns:
            A list of phrases with variables resolved, or None if the file
            was not found
        """
        template_file = TemplateFile(self.types.template, name)
        template_file.data = data

        return template_file.load()

    def load_vocabulary_file(self, name) -> List[List[str]]:
        """Loads a file containing variations of words meaning the same thing.

        A vocabulary file defines words a skill uses for intent matching.
        It can also be used to match words in an utterance after intent
        matching is complete.

        Args:
            name: name of the vocabulary file, no extension needed

        Returns:
            List representation of the vocabulary file.
        """
        vocabulary_file = VocabularyFile(self.types.vocabulary, name)

        return vocabulary_file.load()

    def load_word_file(self, name) -> Optional[str]:
        """Loads a file containing a word.

        Args:
            name: name of the word file, no extension needed

        Returns:
            The word contained in the file, or None if it was not found.
        """
        word_file = WordFile(self.types.word, name)

        return word_file.load()

    def render_dialog(self, name, data=None) -> str:
        """Renders a dialog through the dialog renderer for TTS purposes.

        Args:
            name: name of the dialog file (no extension needed)
            data: keyword arguments used to populate variables

        Returns:
            The value produced by the dialog renderer for this dialog.
        """
        resource_file = DialogFile(self.types.dialog, name)
        resource_file.data = data

        return resource_file.render(self.dialog_renderer)

    def load_skill_vocabulary(self, alphanumeric_skill_id: str) -> dict:
        """Loads every .voc file found under the vocabulary base directory.

        Args:
            alphanumeric_skill_id: skill identifier used as key prefix

        Returns:
            Dictionary keyed by the skill id followed by the title-cased file
            name (extension removed).  Files that yield no vocabulary are
            left out.  Empty when the skill has no vocabulary directory.
        """
        skill_vocabulary = {}
        base_directory = self.types.vocabulary.base_directory
        # walk(None) raises TypeError; a skill without a vocabulary
        # directory simply has no vocabulary.
        if base_directory is None:
            return skill_vocabulary

        for _, _, files in walk(base_directory):
            for file_name in files:
                if not file_name.endswith(".voc"):
                    continue
                # Strip the four character ".voc" extension.
                vocab_type = alphanumeric_skill_id + file_name[:-4].title()
                vocabulary = self.load_vocabulary_file(file_name)
                if vocabulary:
                    skill_vocabulary[vocab_type] = vocabulary

        return skill_vocabulary

    def load_skill_regex(self, alphanumeric_skill_id: str) -> List[str]:
        """Loads every .rx file found under the regex base directory.

        Args:
            alphanumeric_skill_id: skill identifier added to group names

        Returns:
            All patterns found, with the skill id prepended to each named
            group.  Empty when the skill has no regex directory.

        Raises:
            re.error: if a modified pattern does not compile
        """
        skill_regexes = []
        base_directory = self.types.regex.base_directory
        # walk(None) raises TypeError; treat a missing directory as empty.
        if base_directory is not None:
            for _, _, files in walk(base_directory):
                for file_name in files:
                    if file_name.endswith(".rx"):
                        skill_regexes.extend(self.load_regex_file(file_name))

        skill_regexes = self._make_unique_regex_group(
            skill_regexes, alphanumeric_skill_id
        )

        return skill_regexes

    @staticmethod
    def _make_unique_regex_group(
        regexes: List[str], alphanumeric_skill_id: str
    ) -> List[str]:
        """Adds skill ID to group ID in a regular expression for uniqueness.

        Args:
            regexes: regex strings
            alphanumeric_skill_id: skill identifier

        Returns:
            regular expressions with uniquely named group IDs

        Raises:
            re.error if the regex does not compile
        """
        modified_regexes = []
        base = "(?P<" + alphanumeric_skill_id
        for regex in regexes:
            modified_regex = base.join(regex.split("(?P<"))
            # Compiled only to validate; the string form is what is returned.
            re.compile(modified_regex)
            modified_regexes.append(modified_regex)

        return modified_regexes


class RegexExtractor:
    """Extracts data from an utterance using regular expressions.

    Attributes:
        group_name: name of the regex group whose value is extracted
        regex_patterns: regular expressions read from a .rx file
    """

    def __init__(self, group_name, regex_patterns):
        self.group_name = group_name
        self.regex_patterns = regex_patterns

    def extract(self, utterance) -> Optional[str]:
        """Attempt to find a value in a user request.

        Args:
            utterance: request spoken by the user

        Returns:
            The value extracted from the utterance, if found
        """
        extract = None
        pattern_match = self._match_utterance_to_patterns(utterance)
        if pattern_match is not None:
            extract = self._extract_group_from_match(pattern_match)
        self._log_extraction_result(extract)

        return extract

    def _match_utterance_to_patterns(self, utterance: str):
        """Match regular expressions to user request.

        Patterns are tried in order and the first match wins.

        Args:
            utterance: request spoken by the user

        Returns:
            a regular expression match object if a match is found
        """
        pattern_match = None
        for pattern in self.regex_patterns:
            pattern_match = re.search(pattern, utterance)
            if pattern_match:
                break

        return pattern_match

    def _extract_group_from_match(self, pattern_match) -> Optional[str]:
        """Extract the value of the configured group from a match.

        Args:
            pattern_match: a regular expression match object

        Returns:
            The stripped group value, or None when the pattern has no such
            group, the group did not take part in the match, or the value
            is empty after stripping.
        """
        try:
            group_value = pattern_match.group(self.group_name)
        except IndexError:
            # The matching pattern does not define this group.
            return None

        # group() yields None for a group that exists in the pattern but did
        # not participate in the match (e.g. an optional group).
        if group_value is None:
            return None

        return group_value.strip() or None

    def _log_extraction_result(self, extract: Optional[str]):
        """Log the results of the matching.

        Args:
            extract: the value extracted from the user utterance
        """
        if extract is None:
            LOG.info(f"No {self.group_name.lower()} extracted from utterance")
        else:
            LOG.info(f"{self.group_name} extracted from utterance: " + extract)


def to_alnum(skill_id):
    """Convert a skill id to only alphanumeric characters

    Non alpha-numeric characters are converted to "_"

    Args:
        skill_id (str): identifier to be converted
    Returns:
        (str) String of letters, digits and underscores
    """
    return "".join(c if c.isalnum() else "_" for c in str(skill_id))


def munge_regex(regex, skill_id):
    """Insert skill id as letters into match groups.

    Args:
        regex (str): regex string
        skill_id (str): skill identifier
    Returns:
        (str) munged regex
    """
    base = "(?P<" + to_alnum(skill_id)
    return base.join(regex.split("(?P<"))


def munge_intent_parser(intent_parser, name, skill_id):
    """Rename intent keywords to make them skill exclusive

    This gives the intent parser an exclusive name in the format
    <skill_id>:<name>.  The keywords are given unique names in the format
    <skill id as alphanumeric string><keyword>.

    The function will not munge instances that have already been munged.

    Args:
        intent_parser: (IntentParser) object to update
        name: (str) Skill name
        skill_id: skill identifier (converted with str())
    """
    # Munge parser name
    if not name.startswith(str(skill_id) + ":"):
        intent_parser.name = str(skill_id) + ":" + name
    else:
        intent_parser.name = name

    # Munge keywords
    skill_id = to_alnum(skill_id)

    # Munge required keywords
    reqs = []
    for keyword in intent_parser.requires:
        if not keyword[0].startswith(skill_id):
            munged = skill_id + keyword[0]
            reqs.append((munged, munged))
        else:
            reqs.append(keyword)
    intent_parser.requires = reqs

    # Munge optional keywords
    opts = []
    for keyword in intent_parser.optional:
        if not keyword[0].startswith(skill_id):
            munged = skill_id + keyword[0]
            opts.append((munged, munged))
        else:
            opts.append(keyword)
    intent_parser.optional = opts

    # Munge at_least_one keywords
    # Uses the same prefix test as the other groups; removing the skill id
    # with str.replace() would also strip it from the middle of a keyword.
    at_least_one = []
    for group in intent_parser.at_least_one:
        element = [
            entry if entry.startswith(skill_id) else skill_id + entry
            for entry in group
        ]
        at_least_one.append(tuple(element))
    intent_parser.at_least_one = at_least_one

    # Munge excluded keywords
    excludes = []
    for keyword in intent_parser.excludes:
        if not keyword.startswith(skill_id):
            excludes.append(skill_id + keyword)
        else:
            excludes.append(keyword)
    intent_parser.excludes = excludes