# mycroft-core/mycroft/skills/skill_data.py

# Copyright 2018 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Handling of skill data such as intents and regular expressions."""
import re
from collections import namedtuple
from os import walk
from pathlib import Path
from typing import Iterator, List, Optional, Tuple

from mycroft.util.file_utils import resolve_resource_file
from mycroft.util.format import expand_options
from mycroft.util.log import LOG

# Record with one field per kind of resource file a skill can provide.
SkillResourceTypes = namedtuple(
    "SkillResourceTypes",
    "dialog entity intent list named_value regex template vocabulary word",
)


class ResourceType:
    """Defines the attributes of a type of skill resource.

    Examples:
        dialog = ResourceType("dialog", ".dialog", "en-us")
        dialog.locate_base_directory(skill_directory)

        named_value = ResourceType("named_value", ".value", "en-us")
        named_value.locate_base_directory(skill_directory)

    Attributes:
        resource_type: one of a predefined set of resource types for skills
        file_extension: the file extension associated with the resource type
        language: language of the resource files; changed to "en-us" when
            only the English fallback directory could be located
        base_directory: directory containing all files for the resource type;
            None until locate_base_directory finds one
    """

    def __init__(self, resource_type: str, file_extension: str, language: str):
        self.resource_type = resource_type
        self.file_extension = file_extension
        self.language = language
        self.base_directory = None

    def locate_base_directory(self, skill_directory):
        """Find the skill's base directory for the specified resource type.

        There are three supported methodologies for storing resource files.
        The preferred method is to use the "locale" directory but older methods
        are included in the search for backwards compatibility.  The three
        directory schemes are:
           <skill>/locale/<lang>/.../<resource_type>
           <skill>/<resource_subdirectory>/<lang>/
           <skill>/<resource_subdirectory>
        If the directory for the specified language doesn't exist, fall back to
        the default "en-us".

        The first existing directory is stored in the base_directory
        attribute; the attribute is left untouched when none exists.  When
        an "en-us" fallback directory is selected, the language attribute is
        changed to "en-us" as well.

        Args:
            skill_directory: the root directory of a skill
        """
        resource_subdirectory = self._get_resource_subdirectory()
        legacy_directory = Path(skill_directory, resource_subdirectory)
        # Each candidate is paired with the language of the files it holds.
        # None marks the legacy directory that has no language component.
        # Tracking the language explicitly (rather than searching the path
        # text for "en-us") keeps a skill whose own path happens to contain
        # "en-us" from being mistaken for the English fallback.
        candidates = (
            (Path(skill_directory, "locale", self.language), self.language),
            (Path(skill_directory, "locale", "en-us"), "en-us"),
            (legacy_directory / self.language, self.language),
            (legacy_directory / "en-us", "en-us"),
            (legacy_directory, None),
        )
        for directory, directory_language in candidates:
            if directory.exists():
                self.base_directory = directory
                if directory_language is not None:
                    self.language = directory_language
                break

    def _get_resource_subdirectory(self) -> str:
        """Returns the subdirectory for this resource type.

        In the older directory schemes, several resource types were stored
        in the same set of three directories (dialog, regex, vocab).

        Raises:
            KeyError: if the resource type is not one of the known types
        """
        subdirectories = dict(
            dialog="dialog",
            entity="vocab",
            intent="vocab",
            list="dialog",
            named_value="dialog",
            regex="regex",
            template="dialog",
            vocab="vocab",
            # Accept the long form too; it is the name of the matching
            # SkillResourceTypes field.
            vocabulary="vocab",
            word="dialog",
        )

        return subdirectories[self.resource_type]


class ResourceFile:
    """Loads a resource file for the user's configured language.

    Attributes:
        resource_type: attributes of the resource type (dialog, vocab, etc.)
        resource_name: file name of the resource, with or without extension
        file_path: path to the file, or None if the file could not be found
    """

    def __init__(self, resource_type, resource_name):
        self.resource_type = resource_type
        self.resource_name = resource_name
        self.file_path = self._locate()

    def _locate(self):
        """Locates a resource file in the skill's locale directory.

        A skill's locale directory can contain a subdirectory structure defined
        by the skill author.  Walk the directory and any subdirectories to
        find the resource file.  If the skill does not provide the file,
        resolve_resource_file is asked for "text/<language>/<file name>".

        Returns:
            The path of the resource file or None if it was not found; an
            error is logged in the latter case.
        """
        file_path = None
        if self.resource_name.endswith(self.resource_type.file_extension):
            file_name = self.resource_name
        else:
            file_name = self.resource_name + self.resource_type.file_extension

        base_directory = self.resource_type.base_directory
        # Without this guard str(None) would make walk() search a directory
        # literally named "None" relative to the working directory.
        if base_directory is not None:
            for directory, _, file_names in walk(str(base_directory)):
                # Deliberately no break: if the name occurs in more than one
                # subdirectory the last one visited is used.
                if file_name in file_names:
                    file_path = Path(directory, file_name)

        if file_path is None:
            sub_path = Path("text", self.resource_type.language, file_name)
            file_path = resolve_resource_file(str(sub_path))

        if file_path is None:
            LOG.error(f"Could not find resource file {file_name}")

        return file_path

    def load(self):
        """Override in subclass to define resource type loading behavior."""

    def _read(self) -> Iterator[str]:
        """Reads the specified file, removing comment and empty lines.

        Yields:
            Each remaining line with surrounding whitespace stripped.
        """
        # Resource files hold text in many languages; decode them as UTF-8
        # instead of relying on the locale's default encoding.
        with open(self.file_path, encoding="utf-8") as resource_file:
            for raw_line in resource_file:
                line = raw_line.strip()
                if not line or line.startswith("#"):
                    continue
                yield line


class DialogFile(ResourceFile):
    """Defines a dialog file, which is used to instruct TTS what to speak.

    Attributes:
        data: optional dictionary of variable names and the values to
            substitute for them in the dialog text
    """

    def __init__(self, resource_type, resource_name):
        super().__init__(resource_type, resource_name)
        self.data = None

    def load(self) -> Optional[List[str]]:
        """Load the lines of the file and populate the variables.

        Returns:
            Contents of the file with variables resolved, or None if the
            file could not be located.
        """
        if self.file_path is None:
            return None

        dialogs = []
        for line in self._read():
            # Collapse the double braces used in dialog files into the single
            # braces str.format expects.
            line = line.replace("{{", "{").replace("}}", "}")
            if self.data is not None:
                line = line.format(**self.data)
            dialogs.append(line)

        return dialogs

    def render(self, dialog_renderer):
        """Renders a phrase for this dialog using the supplied renderer.

        The resource name and the data attribute are handed to the renderer,
        which decides which phrase is returned.

        Args:
            dialog_renderer: object providing a render(name, data) method
        Returns:
            whatever dialog_renderer.render returns for this dialog
        """
        return dialog_renderer.render(self.resource_name, self.data)


class VocabularyFile(ResourceFile):
    """A vocabulary file, the contents of which skills use to form intents."""

    def load(self) -> List[List[str]]:
        """Reads the vocabulary file and expands each of its records.

        Every record is lower-cased and passed to expand_options, so a
        record containing sets of words inside parentheses produces one
        vocabulary item for each permutation of those words.

        Returns:
            One entry for each record in the file.  Each entry is the list
            of strings the record expands to.  The list is empty if the file
            was not found.
        """
        if self.file_path is None:
            return []

        return [expand_options(record.lower()) for record in self._read()]


class NamedValueFile(ResourceFile):
    """Defines a named value file, which maps names to values.

    Attributes:
        delimiter: text separating the name from the value in each record;
            a comma unless changed after construction
    """

    def __init__(self, resource_type, resource_name):
        super().__init__(resource_type, resource_name)
        self.delimiter = ","

    def load(self) -> dict:
        """Load file containing names and values.

        Records that cannot be split into exactly one name and one value are
        logged and skipped.

        Returns:
            A dictionary representation of the records in the file; empty if
            the file could not be located.
        """
        named_values = dict()
        if self.file_path is not None:
            for line in self._read():
                name, value = self._load_line(line)
                if name is not None and value is not None:
                    named_values[name] = value

        return named_values

    def _load_line(self, line: str) -> Tuple[Optional[str], Optional[str]]:
        """Attempts to split the name and value for dictionary loading.

        Args:
            line: a record in a .value file
        Returns:
            The name/value pair that will be loaded into a dictionary, or
            (None, None) if the record does not consist of exactly two
            delimited items.
        """
        name = None
        value = None
        try:
            # Unpacking fails with ValueError for too few or too many items
            name, value = line.split(self.delimiter)
        except ValueError:
            LOG.exception(
                f"Failed to load value file {self.file_path} "
                f"record containing {line}"
            )

        return name, value


class ListFile(DialogFile):
    """Defines a list file; it is loaded exactly like a dialog file."""


class TemplateFile(DialogFile):
    """Defines a template file; it is loaded exactly like a dialog file."""


class RegexFile(ResourceFile):
    """Defines a regular expression file with one pattern per record."""

    def load(self):
        """Reads the patterns contained in the file.

        Returns:
            List of the records in the file, without comments and empty
            lines.  The list is empty if the file was not found.
        """
        if not self.file_path:
            return []

        return list(self._read())


class WordFile(ResourceFile):
    """A word file, which holds a single word in the configured language."""

    def load(self) -> Optional[str]:
        """Reads the word from the file.

        Only the first record of the file is used; anything after it is
        ignored.

        Returns:
            The word contained in the file, or None if the file was not
            found or contains no records.
        """
        if self.file_path is None:
            return None

        return next(iter(self._read()), None)


class SkillResources:
    """Loads the resource files of a skill for a single language.

    Attributes:
        skill_directory: the root directory of the skill
        language: language used to locate the resource files
        types: a ResourceType for every known kind of resource file
        dialog_renderer: object whose render method is used by render_dialog
        static: named value files that were already loaded, keyed by name
    """

    def __init__(self, skill_directory, language, dialog_renderer):
        self.skill_directory = skill_directory
        self.language = language
        self.types = self._define_resource_types()
        self.dialog_renderer = dialog_renderer
        self.static = dict()

    def _define_resource_types(self) -> SkillResourceTypes:
        """Defines all known types of skill resource files.

        A resource file contains information the skill needs to function.
        Examples include dialog files to be spoken and vocab files for intent
        matching.

        Returns:
            The resource types, each with its base directory already located.
        """
        resource_types = dict(
            dialog=ResourceType("dialog", ".dialog", self.language),
            entity=ResourceType("entity", ".entity", self.language),
            intent=ResourceType("intent", ".intent", self.language),
            list=ResourceType("list", ".list", self.language),
            named_value=ResourceType("named_value", ".value", self.language),
            regex=ResourceType("regex", ".rx", self.language),
            template=ResourceType("template", ".template", self.language),
            vocabulary=ResourceType("vocab", ".voc", self.language),
            word=ResourceType("word", ".word", self.language),
        )
        for resource_type in resource_types.values():
            resource_type.locate_base_directory(self.skill_directory)

        return SkillResourceTypes(**resource_types)

    def load_dialog_file(self, name, data=None) -> Optional[List[str]]:
        """Loads the contents of a dialog file into memory.

        Named variables in the dialog are populated with values found in the
        data dictionary.

        Args:
            name: name of the dialog file (no extension needed)
            data: keyword arguments used to populate variables
        Returns:
            A list of phrases with variables resolved, or None if the file
            could not be located.
        """
        dialog_file = DialogFile(self.types.dialog, name)
        dialog_file.data = data

        return dialog_file.load()

    def load_list_file(self, name, data=None) -> Optional[List[str]]:
        """Load a file containing a list of words or phrases

        Named variables in the list are populated with values found in the
        data dictionary.

        Args:
            name: name of the list file (no extension needed)
            data: keyword arguments used to populate variables
        Returns:
            List of words or phrases read from the list file, or None if the
            file could not be located.
        """
        list_file = ListFile(self.types.list, name)
        list_file.data = data

        return list_file.load()

    def load_named_value_file(self, name, delimiter=None) -> dict:
        """Load file containing a set names and values.

        Loads a simple delimited file of name/value pairs.
        The name is the first item, the value is the second.

        The result is cached by name, so the delimiter only has an effect
        the first time a given name is loaded.

        Args:
            name: name of the .value file, no extension needed
            delimiter: delimiter character used
        Returns:
            File contents represented as a dictionary
        """
        if name in self.static:
            named_values = self.static[name]
        else:
            named_value_file = NamedValueFile(self.types.named_value, name)
            if delimiter is not None:
                named_value_file.delimiter = delimiter
            named_values = named_value_file.load()
            self.static[name] = named_values

        return named_values

    def load_regex_file(self, name) -> List[str]:
        """Loads a file containing regular expression patterns.

        The regular expression patterns are generally used to find a value
        in a user utterance the skill needs to properly perform the requested
        function.

        Args:
            name: name of the regular expression file, no extension needed
        Returns:
            List representation of the regular expression file.
        """
        regex_file = RegexFile(self.types.regex, name)

        return regex_file.load()

    def load_template_file(self, name, data=None) -> Optional[List[str]]:
        """Loads the contents of a template file into memory.

        Named variables in the template are populated with values found in
        the data dictionary.

        Args:
            name: name of the template file (no extension needed)
            data: keyword arguments used to populate variables
        Returns:
            A list of phrases with variables resolved, or None if the file
            could not be located.
        """
        template_file = TemplateFile(self.types.template, name)
        template_file.data = data

        return template_file.load()

    def load_vocabulary_file(self, name) -> List[List[str]]:
        """Loads a file containing variations of words meaning the same thing.

        A vocabulary file defines words a skill uses for intent matching.
        It can also be used to match words in an utterance after intent
        matching is complete.

        Args:
            name: name of the vocabulary file, no extension needed
        Returns:
            List representation of the vocabulary file.  Each item is the
            list of options a record of the file expands to.
        """
        vocabulary_file = VocabularyFile(self.types.vocabulary, name)

        return vocabulary_file.load()

    def load_word_file(self, name) -> Optional[str]:
        """Loads a file containing a word.

        Args:
            name: name of the word file, no extension needed
        Returns:
            The word contained in the file, or None if there is none.
        """
        word_file = WordFile(self.types.word, name)

        return word_file.load()

    def render_dialog(self, name, data=None) -> str:
        """Renders a record of a dialog file for TTS purposes.

        The selection of the record is left to the dialog renderer.

        Args:
            name: name of the dialog file (no extension needed)
            data: keyword arguments used to populate variables
        Returns:
            The phrase produced by the dialog renderer.
        """
        resource_file = DialogFile(self.types.dialog, name)
        resource_file.data = data

        return resource_file.render(self.dialog_renderer)

    def load_skill_vocabulary(self, alphanumeric_skill_id: str) -> dict:
        """Loads every vocabulary file found in the vocabulary directory.

        Args:
            alphanumeric_skill_id: skill identifier used as key prefix
        Returns:
            Contents of each non-empty .voc file, keyed by the skill
            identifier followed by the title-cased file name (without
            extension).  Empty if the skill has no vocabulary directory.
        """
        skill_vocabulary = {}
        base_directory = self.types.vocabulary.base_directory
        if base_directory is None:
            # No vocabulary directory was located; walk() rejects None
            return skill_vocabulary

        for _, _, files in walk(base_directory):
            for file_name in files:
                if not file_name.endswith(".voc"):
                    continue
                # [:-4] drops the ".voc" extension
                vocab_type = alphanumeric_skill_id + file_name[:-4].title()
                vocabulary = self.load_vocabulary_file(file_name)
                if vocabulary:
                    skill_vocabulary[vocab_type] = vocabulary

        return skill_vocabulary

    def load_skill_regex(self, alphanumeric_skill_id: str) -> List[str]:
        """Loads every regular expression file found in the regex directory.

        Args:
            alphanumeric_skill_id: skill identifier added to the group names
        Returns:
            The patterns of all .rx files with the skill identifier inserted
            into their named groups.  Empty if the skill has no regular
            expression directory.
        Raises:
            re.error if one of the patterns does not compile
        """
        skill_regexes = []
        base_directory = self.types.regex.base_directory
        # walk() rejects None, which is what base_directory is when no
        # regular expression directory was located for the skill.
        if base_directory is not None:
            for _, _, files in walk(base_directory):
                for file_name in files:
                    if file_name.endswith(".rx"):
                        skill_regexes.extend(self.load_regex_file(file_name))

        skill_regexes = self._make_unique_regex_group(
            skill_regexes, alphanumeric_skill_id
        )

        return skill_regexes

    @staticmethod
    def _make_unique_regex_group(
        regexes: List[str], alphanumeric_skill_id: str
    ) -> List[str]:
        """Adds skill ID to group ID in a regular expression for uniqueness.

        Args:
            regexes: regular expression strings
            alphanumeric_skill_id: skill identifier
        Returns:
            regular expressions with uniquely named group IDs
        Raises:
            re.error if the regex does not compile
        """
        modified_regexes = []
        for regex in regexes:
            base = "(?P<" + alphanumeric_skill_id
            modified_regex = base.join(regex.split("(?P<"))
            # Compiled only to validate; the string form is what is returned
            re.compile(modified_regex)
            modified_regexes.append(modified_regex)

        return modified_regexes


class RegexExtractor:
    """Extracts data from an utterance using regular expressions.

    Attributes:
        group_name: name of the regular expression group holding the value
        regex_patterns: regular expressions read from a .rx file
    """

    def __init__(self, group_name, regex_patterns):
        self.group_name = group_name
        self.regex_patterns = regex_patterns

    def extract(self, utterance) -> Optional[str]:
        """Attempt to find a value in a user request.

        Args:
            utterance: request spoken by the user

        Returns:
            The value extracted from the utterance, if found
        """
        extract = None
        pattern_match = self._match_utterance_to_patterns(utterance)
        if pattern_match is not None:
            extract = self._extract_group_from_match(pattern_match)
        self._log_extraction_result(extract)

        return extract

    def _match_utterance_to_patterns(self, utterance: str):
        """Match regular expressions to user request.

        The patterns are tried in order and the first one that matches is
        used.

        Args:
            utterance: request spoken by the user

        Returns:
            a regular expression match object if a match is found, else None
        """
        for pattern in self.regex_patterns:
            pattern_match = re.search(pattern, utterance)
            if pattern_match:
                return pattern_match

        return None

    def _extract_group_from_match(self, pattern_match) -> Optional[str]:
        """Extract the value of the named group from a match.

        Args:
            pattern_match: a regular expression match object

        Returns:
            The text captured by the group with surrounding whitespace
            removed, or None if the pattern has no such group, the group did
            not take part in the match, or the captured text is blank.
        """
        try:
            captured = pattern_match.group(self.group_name)
        except IndexError:
            # The pattern that matched does not define the requested group
            return None

        if captured is None:
            # The group is optional in the pattern and matched nothing
            return None

        return captured.strip() or None

    def _log_extraction_result(self, extract: Optional[str]):
        """Log the results of the matching.

        Args:
            extract: the value extracted from the user utterance, if any
        """
        if extract is None:
            LOG.info(f"No {self.group_name.lower()} extracted from utterance")
        else:
            LOG.info(f"{self.group_name} extracted from utterance: " + extract)


def to_alnum(skill_id):
    """Build an identifier-safe version of a skill id.

    The id is converted to a string and every character that is not
    alphanumeric is replaced by "_".

    Args:
        skill_id (str): identifier to be converted
    Returns:
        (str) the id consisting of alphanumeric characters and "_" only
    """
    characters = (
        character if character.isalnum() else "_"
        for character in str(skill_id)
    )

    return "".join(characters)


def munge_regex(regex, skill_id):
    """Prefix the name of every named group in a regex with the skill id.

    The skill id is first reduced to alphanumeric characters and "_" so
    that it can be part of a group name.

    Args:
        regex (str): regex string
        skill_id (str): skill identifier
    Returns:
        (str) munged regex
    """
    group_start = "(?P<"

    return regex.replace(group_start, group_start + to_alnum(skill_id))


def munge_intent_parser(intent_parser, name, skill_id):
    """Rename intent keywords to make them skill exclusive
    This gives the intent parser an exclusive name in the
    format <skill_id>:<name>.  The keywords are given unique
    names in the format <Skill id as letters><Intent name>.

    The function will not munge instances that have already been
    munged.  The intent parser is updated in place; nothing is returned.

    Args:
        intent_parser: (IntentParser) object to update
        name: (str) Skill name
        skill_id: skill identifier; converted with str() before use
    """
    # Munge parser name
    parser_prefix = str(skill_id) + ":"
    if name.startswith(parser_prefix):
        intent_parser.name = name
    else:
        intent_parser.name = parser_prefix + name

    # Munge keywords
    skill_id = to_alnum(skill_id)

    def munge_keyword(keyword):
        """Prefix a keyword with the skill id unless it already has it."""
        if keyword.startswith(skill_id):
            return keyword
        return skill_id + keyword

    def munge_pair(pair):
        """Replace a keyword pair by its munged form unless already munged."""
        if pair[0].startswith(skill_id):
            return pair
        # Both items of the new pair are built from the first original item
        munged = skill_id + pair[0]
        return (munged, munged)

    # Munge required keywords
    intent_parser.requires = [munge_pair(i) for i in intent_parser.requires]

    # Munge optional keywords
    intent_parser.optional = [munge_pair(i) for i in intent_parser.optional]

    # Munge at_least_one keywords.  Only a leading skill id marks a keyword
    # as munged; removing the id from anywhere inside the keyword would
    # corrupt keywords that merely contain the same characters.
    intent_parser.at_least_one = [
        tuple(munge_keyword(e) for e in i) for i in intent_parser.at_least_one
    ]

    # Munge excluded keywords
    intent_parser.excludes = [munge_keyword(i) for i in intent_parser.excludes]