mycroft-core/mycroft/util/lang/parse_fr.py

# -*- coding: utf-8 -*-
#
# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
""" Parse functions for french (fr)

    Todo:
        * extractnumber_fr: ordinal numbers ("cinquième")
        * extractnumber_fr: numbers greater than 999 999 ("cinq millions")
        * extract_datetime_fr: "quatrième lundi de janvier"
        * get_gender_fr
"""

from datetime import datetime
from dateutil.relativedelta import relativedelta
from mycroft.util.lang.parse_common import is_numeric, look_for_fractions

# Undefined articles ["un", "une"] cannot be supressed,
# in French, "un cheval" means "a horse" or "one horse".
articles_fr = ["le", "la", "du", "de", "les", "des"]

numbers_fr = {
    "zéro": 0,
    "un": 1,
    "une": 1,
    "deux": 2,
    "trois": 3,
    "quatre": 4,
    "cinq": 5,
    "six": 6,
    "sept": 7,
    "huit": 8,
    "neuf": 9,
    "dix": 10,
    "onze": 11,
    "douze": 12,
    "treize": 13,
    "quatorze": 14,
    "quinze": 15,
    "seize": 16,
    "vingt": 20,
    "trente": 30,
    "quarante": 40,
    "cinquante": 50,
    "soixante": 60,
    "soixante-dix": 70,
    "septante": 70,
    "quatre-vingt": 80,
    "quatre-vingts": 80,
    "octante": 80,
    "huitante": 80,
    "quatre-vingt-dix": 90,
    "nonante": 90,
    "cent": 100,
    "cents": 100,
    "mille": 1000,
    "mil": 1000,
    "millier": 1000,
    "milliers": 1000,
    "million": 1000000,
    "millions": 1000000,
    "milliard": 1000000000,
    "milliards": 1000000000}

ordinals_fr = ("er", "re", "ère", "nd", "nde" "ième", "ème", "e")


def number_parse_fr(words, i):
    """ Parses a list of words to find a number
    Takes in a list of words (strings without whitespace) and
    extracts a number that starts at the given index.
    Args:
        words (array): the list to extract a number from
        i (int): the index in words where to look for the number
    Returns:
        tuple with number, index of next word after the number.

        Returns None if no number was found.
    """
    def cte_fr(i, s):
        # Check if string s is equal to words[i].
        # If it is return tuple with s, index of next word.
        # If it is not return None.
        if i < len(words) and s == words[i]:
            return s, i + 1
        return None

    def number_word_fr(i, mi, ma):
        # Check if words[i] is a number in numbers_fr between mi and ma.
        # If it is return tuple with number, index of next word.
        # If it is not return None.
        if i < len(words):
            val = numbers_fr.get(words[i])
            # Numbers [1-16,20,30,40,50,60,70,80,90,100,1000]
            if val is not None:
                if val >= mi and val <= ma:
                    return val, i + 1
                else:
                    return None
            # The number may be hyphenated (numbers [17-999])
            splitWord = words[i].split('-')
            if len(splitWord) > 1:
                val1 = numbers_fr.get(splitWord[0])
                if val1:
                    i1 = 0
                    val2 = 0
                    val3 = 0
                    if val1 < 10 and splitWord[1] == "cents":
                        val1 = val1 * 100
                        i1 = 2

                    # For [81-99], e.g. "quatre-vingt-deux"
                    if len(splitWord) > i1 and splitWord[0] == "quatre" and \
                            splitWord[1] == "vingt":
                        val1 = 80
                        i1 += 2

                    # We still found a number
                    if i1 == 0:
                        i1 = 1

                    if len(splitWord) > i1:
                        # For [21,31,41,51,61,71]
                        if len(splitWord) > i1 + 1 and splitWord[i1] == "et":
                            val2 = numbers_fr.get(splitWord[i1 + 1])
                            if val2 is not None:
                                i1 += 2
                        # For [77-79],[97-99] e.g. "soixante-dix-sept"
                        elif splitWord[i1] == "dix" and \
                                len(splitWord) > i1 + 1:
                            val2 = numbers_fr.get(splitWord[i1 + 1])
                            if val2 is not None:
                                val2 += 10
                                i1 += 2
                        else:
                            val2 = numbers_fr.get(splitWord[i1])
                            if val2 is not None:
                                i1 += 1
                                if len(splitWord) > i1:
                                    val3 = numbers_fr.get(splitWord[i1])
                                    if val3 is not None:
                                        i1 += 1

                        if val2:
                            if val3:
                                val = val1 + val2 + val3
                            else:
                                val = val1 + val2
                        else:
                            return None
                    if i1 == len(splitWord) and val and ma >= val >= mi:
                        return val, i + 1

        return None

    def number_1_99_fr(i):
        # Check if words[i] is a number between 1 and 99.
        # If it is return tuple with number, index of next word.
        # If it is not return None.

        # Is it a number between 1 and 16?
        result1 = number_word_fr(i, 1, 16)
        if result1:
            return result1

        # Is it a number between 10 and 99?
        result1 = number_word_fr(i, 10, 99)
        if result1:
            val1, i1 = result1
            result2 = cte_fr(i1, "et")
            # If the number is not hyphenated [21,31,41,51,61,71]
            if result2:
                i2 = result2[1]
                result3 = number_word_fr(i2, 1, 11)
                if result3:
                    val3, i3 = result3
                    return val1 + val3, i3
            return result1

        # It is not a number
        return None

    def number_1_999_fr(i):
        # Check if words[i] is a number between 1 and 999.
        # If it is return tuple with number, index of next word.
        # If it is not return None.

        # Is it 100 ?
        result = number_word_fr(i, 100, 100)

        # Is it [200,300,400,500,600,700,800,900]?
        if not result:
            resultH1 = number_word_fr(i, 2, 9)
            if resultH1:
                valH1, iH1 = resultH1
                resultH2 = number_word_fr(iH1, 100, 100)
                if resultH2:
                    iH2 = resultH2[1]
                    result = valH1 * 100, iH2

        if result:
            val1, i1 = result
            result2 = number_1_99_fr(i1)
            if result2:
                val2, i2 = result2
                return val1 + val2, i2
            else:
                return result

        # Is it hyphenated? [101-999]
        result = number_word_fr(i, 101, 999)
        if result:
            return result

        # [1-99]
        result = number_1_99_fr(i)
        if result:
            return result

        return None

    def number_1_999999_fr(i):
        """ Find a number in a list of words
        Checks if words[i] is a number between 1 and 999,999.

        Args:
            i (int): the index in words where to look for the number
        Returns:
            tuple with number, index of next word after the number.

            Returns None if no number was found.
        """

        # check for zero
        result1 = number_word_fr(i, 0, 0)
        if result1:
            return result1

        # check for [1-999]
        result1 = number_1_999_fr(i)
        if result1:
            val1, i1 = result1
        else:
            val1 = 1
            i1 = i
        # check for 1000
        result2 = number_word_fr(i1, 1000, 1000)
        if result2:
            # it's [1000-999000]
            i2 = result2[1]
            # check again for [1-999]
            result3 = number_1_999_fr(i2)
            if result3:
                val3, i3 = result3
                return val1 * 1000 + val3, i3
            else:
                return val1 * 1000, i2
        elif result1:
            return result1
        return None

    return number_1_999999_fr(i)


def getOrdinal_fr(word):
    """ Get the ordinal number
    Takes in a word (string without whitespace) and
    extracts the ordinal number.
    Args:
        word (string): the word to extract the number from
    Returns:
        number (int)

        Returns None if no ordinal number was found.
    """
    if word:
        for ordinal in ordinals_fr:
            if word[0].isdigit() and ordinal in word:
                result = word.replace(ordinal, "")
                if result.isdigit():
                    return int(result)

    return None


def number_ordinal_fr(words, i):
    """ Find an ordinal number in a list of words
    Takes in a list of words (strings without whitespace) and
    extracts an ordinal number that starts at the given index.
    Args:
        words (array): the list to extract a number from
        i (int): the index in words where to look for the ordinal number
    Returns:
        tuple with ordinal number (str),
        index of next word after the number (int).

        Returns None if no ordinal number was found.
    """
    val1 = None
    strOrd = ""
    # it's already a digit, normalize to "1er" or "5e"
    val1 = getOrdinal_fr(words[i])
    if val1 is not None:
        if val1 == 1:
            strOrd = "1er"
        else:
            strOrd = str(val1) + "e"
        return strOrd, i + 1

    # if it's a big number the beginning should be detected as a number
    result = number_parse_fr(words, i)
    if result:
        val1, i = result
    else:
        val1 = 0

    if i < len(words):
        word = words[i]
        if word in ["premier", "première"]:
            strOrd = "1er"
        elif word == "second":
            strOrd = "2e"
        elif word.endswith("ième"):
            val2 = None
            word = word[:-4]
            # centième
            if word == "cent":
                if val1:
                    strOrd = str(val1 * 100) + "e"
                else:
                    strOrd = "100e"
            # millième
            elif word == "mill":
                if val1:
                    strOrd = str(val1 * 1000) + "e"
                else:
                    strOrd = "1000e"
            else:
                # "cinquième", "trente-cinquième"
                if word.endswith("cinqu"):
                    word = word[:-1]
                # "neuvième", "dix-neuvième"
                elif word.endswith("neuv"):
                    word = word[:-1] + "f"
                result = number_parse_fr([word], 0)
                if not result:
                    # "trentième", "douzième"
                    word = word + "e"
                    result = number_parse_fr([word], 0)
                if result:
                    val2, i = result
                if val2 is not None:
                    strOrd = str(val1 + val2) + "e"
        if strOrd:
            return strOrd, i + 1

    return None


def extractnumber_fr(text):
    """Takes in a string and extracts a number.
    Args:
        text (str): the string to extract a number from
    Returns:
        (str): The number extracted or the original text.
    """
    # normalize text, keep articles for ordinals versus fractionals
    text = normalize_fr(text, False)
    # split words by whitespace
    aWords = text.split()
    count = 0
    result = None
    add = False
    while count < len(aWords):
        val = None
        word = aWords[count]
        wordNext = ""
        wordPrev = ""
        if count < (len(aWords) - 1):
            wordNext = aWords[count + 1]
        if count > 0:
            wordPrev = aWords[count - 1]

        if word in articles_fr:
            count += 1
            continue
        if word in ["et", "plus", "+"]:
            count += 1
            add = True
            continue

        # is current word a numeric number?
        if word.isdigit():
            val = int(word)
            count += 1
        elif is_numeric(word):
            val = float(word)
            count += 1
        elif wordPrev in articles_fr and getOrdinal_fr(word):
            val = getOrdinal_fr(word)
            count += 1
        # is current word the denominator of a fraction?
        elif isFractional_fr(word):
            val = isFractional_fr(word)
            count += 1

        # is current word the numerator of a fraction?
        if val and wordNext:
            valNext = isFractional_fr(wordNext)
            if valNext:
                val = float(val) * valNext
                count += 1

        if not val:
            count += 1
            # is current word a numeric fraction like "2/3"?
            aPieces = word.split('/')
            # if (len(aPieces) == 2 and is_numeric(aPieces[0])
            #   and is_numeric(aPieces[1])):
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        # is current word followed by a decimal value?
        if wordNext == "virgule":
            zeros = 0
            newWords = aWords[count + 1:]
            # count the number of zeros after the decimal sign
            for word in newWords:
                if word == "zéro" or word == "0":
                    zeros += 1
                else:
                    break
            afterDotVal = None
            # extract the number after the zeros
            if newWords[zeros].isdigit():
                afterDotVal = newWords[zeros]
                countDot = count + zeros + 2
            # if a number was extracted (since comma is also a
            # punctuation sign)
            if afterDotVal:
                count = countDot
                if not val:
                    val = 0
                # add the zeros
                afterDotString = zeros * "0" + afterDotVal
                val = float(str(val) + "." + afterDotString)
        if val:
            if add:
                result += val
                add = False
            else:
                result = val

    # if result == False:
    if not result:
        return normalize_fr(text, True)

    return result


def extract_datetime_fr(string, currentDate=None):
    def clean_string(s):
        """
            cleans the input string of unneeded punctuation and capitalization
            among other things.
        """
        s = normalize_fr(s, True)
        wordList = s.split()
        for idx, word in enumerate(wordList):
            # remove comma and dot if it's not a number
            if word[-1] in [",", "."]:
                word = word[:-1]
            wordList[idx] = word

        return wordList

    def date_found():
        return found or \
            (
                datestr != "" or
                yearOffset != 0 or monthOffset != 0 or dayOffset or
                (isTime and (hrAbs != 0 or minAbs != 0)) or
                hrOffset != 0 or minOffset != 0 or secOffset != 0
            )

    if string == "":
        return None
    if currentDate is None:
        currentDate = datetime.now()

    found = False
    daySpecified = False
    dayOffset = False
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = dateNow.strftime("%w")
    currentYear = dateNow.strftime("%Y")
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    timeQualifiersList = ["matin", "après-midi", "soir", "nuit"]
    words_in = ["dans", "après"]
    markers = ["à", "dès", "autour", "vers", "environs", "ce", "cette"] + \
        words_in
    days = ["lundi", "mardi", "mercredi",
            "jeudi", "vendredi", "samedi", "dimanche"]
    months = ["janvier", "février", "mars", "avril", "mai", "juin",
              "juillet", "août", "septembre", "octobre", "novembre",
              "décembre"]
    monthsShort = ["jan", "fév", "mar", "avr", "mai", "juin", "juil", "aoû",
                   "sept", "oct", "nov", "déc"]
    # needed for format functions
    months_en = ['january', 'february', 'march', 'april', 'may', 'june',
                 'july', 'august', 'september', 'october', 'november',
                 'december']

    words = clean_string(string)

    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrevPrev = words[idx - 3] if idx > 2 else ""
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        start = idx
        used = 0
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word
            used = 1
            if wordPrev in ["ce", "cet", "cette"]:
                used = 2
                start -= 1
        # parse aujourd'hui, demain, après-demain
        elif word == "aujourd'hui" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "demain" and not fromFlag:
            dayOffset = 1
            used += 1
        elif word == "après-demain" and not fromFlag:
            dayOffset = 2
            used += 1
        # parse 5 jours, 10 semaines, semaine dernière, semaine prochaine
        elif word in ["jour", "jours"]:
            if wordPrev.isdigit():
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
            # "3e jour"
            elif getOrdinal_fr(wordPrev) is not None:
                dayOffset += getOrdinal_fr(wordPrev) - 1
                start -= 1
                used = 2
        elif word in ["semaine", "semaines"] and not fromFlag:
            if wordPrev[0].isdigit():
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordNext in ["prochaine", "suivante"]:
                dayOffset = 7
                used = 2
            elif wordNext in ["dernière", "précédente"]:
                dayOffset = -7
                used = 2
        # parse 10 mois, mois prochain, mois dernier
        elif word == "mois" and not fromFlag:
            if wordPrev[0].isdigit():
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordNext in ["prochain", "suivant"]:
                monthOffset = 1
                used = 2
            elif wordNext in ["dernier", "précédent"]:
                monthOffset = -1
                used = 2
        # parse 5 ans, an prochain, année dernière
        elif word in ["an", "ans", "année", "années"] and not fromFlag:
            if wordPrev[0].isdigit():
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordNext in ["prochain", "prochaine", "suivant", "suivante"]:
                yearOffset = 1
                used = 2
            elif wordNext in ["dernier", "dernière", "précédent",
                              "précédente"]:
                yearOffset = -1
                used = 2
        # parse lundi, mardi etc., and lundi prochain, mardi dernier, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordNext in ["prochain", "suivant"]:
                dayOffset += 7
                used += 1
            elif wordNext in ["dernier", "précédent"]:
                dayOffset -= 7
                used += 1
        # parse 15 juillet, 15 juil
        elif word in months or word in monthsShort and not fromFlag:
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            datestr = months_en[m]
            if wordPrev and (wordPrev[0].isdigit()):
                datestr += " " + wordPrev
                start -= 1
                used += 1
            else:
                datestr += " 1"
            if wordNext and wordNext[0].isdigit():
                datestr += " " + wordNext
                used += 1
                hasYear = True
            else:
                hasYear = False
        # parse 5 jours après demain, 10 semaines après jeudi prochain,
        # 2 mois après juillet
        validFollowups = days + months + monthsShort
        validFollowups.append("aujourd'hui")
        validFollowups.append("demain")
        validFollowups.append("prochain")
        validFollowups.append("prochaine")
        validFollowups.append("suivant")
        validFollowups.append("suivante")
        validFollowups.append("dernier")
        validFollowups.append("dernière")
        validFollowups.append("précédent")
        validFollowups.append("précédente")
        validFollowups.append("maintenant")
        if word in ["après", "depuis"] and wordNext in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "demain":
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                if wordNextNext == "prochain":
                    tmpOffset += 7
                    used += 1
                elif wordNextNext == "dernier":
                    tmpOffset -= 7
                    used += 1
                elif tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
        if used > 0:
            if start - 1 > 0 and words[start - 1] in ["ce", "cette"]:
                start -= 1
                used += 1

            for i in range(0, used):
                words[i + start] = ""

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # parse time
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = 0
    minAbs = 0
    ampm = ""
    isTime = False

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        used = 0
        start = idx

        # parse midi et quart, minuit et demi, midi 10, minuit moins 20
        if word in ["midi", "minuit"]:
            isTime = True
            if word == "midi":
                hrAbs = 12
                used += 1
            elif word == "minuit":
                hrAbs = 0
                used += 1
            if wordNext.isdigit():
                minAbs = int(wordNext)
                used += 1
            elif wordNext == "et":
                if wordNextNext == "quart":
                    minAbs = 15
                    used += 2
                elif wordNextNext == "demi":
                    minAbs = 30
                    used += 2
            elif wordNext == "moins":
                if wordNextNext.isdigit():
                    minAbs = 60 - int(wordNextNext)
                    if hrAbs == 0:
                        hrAbs = 23
                    else:
                        hrAbs -= 1
                    used += 2
                if wordNextNext == "quart":
                    minAbs = 45
                    if hrAbs == 0:
                        hrAbs = 23
                    else:
                        hrAbs -= 1
                    used += 2
        # parse une demi-heure, un quart d'heure
        elif word == "demi-heure" or word == "heure" and \
                (wordPrevPrev in markers or wordPrevPrevPrev in markers):
            used = 1
            isTime = True
            if word == "demi-heure":
                minOffset = 30
            elif wordPrev == "quart":
                minOffset = 15
                used += 1
                start -= 1
            elif wordPrev == "quarts" and wordPrevPrev.isdigit():
                minOffset = int(wordPrevPrev) * 15
                used += 1
                start -= 1
            if wordPrev.isdigit() or wordPrevPrev.isdigit():
                start -= 1
                used += 1
        # parse 5:00 du matin, 12:00, etc
        elif word[0].isdigit() and getOrdinal_fr(word) is None:
            isTime = True
            if ":" in word or "h" in word or "min" in word:
                # parse hours on short format
                # "3:00 du matin", "4h14", "3h15min"
                strHH = ""
                strMM = ""
                stage = 0
                length = len(word)
                for i in range(length):
                    if stage == 0:
                        if word[i].isdigit():
                            strHH += word[i]
                            used = 1
                        elif word[i] in [":", "h", "m"]:
                            stage = 1
                        else:
                            stage = 2
                            i -= 1
                    elif stage == 1:
                        if word[i].isdigit():
                            strMM += word[i]
                            used = 1
                        else:
                            stage = 2
                            if word[i:i+3] == "min":
                                i += 1
                    elif stage == 2:
                        break
                if wordPrev in words_in:
                    hrOffset = int(strHH) if strHH else 0
                    minOffset = int(strMM) if strMM else 0
                else:
                    hrAbs = int(strHH) if strHH else 0
                    minAbs = int(strMM) if strMM else 0
            else:
                # try to parse time without colons
                # 5 hours, 10 minutes etc.
                length = len(word)
                ampm = ""
                if (
                        word.isdigit() and
                        wordNext in ["heures", "heure"] and word != "0" and
                        (
                            int(word) < 100 or
                            int(word) > 2400
                        )):
                    # "dans 3 heures", "à 3 heures"
                    if wordPrev in words_in:
                        hrOffset = int(word)
                    else:
                        hrAbs = int(word)
                    used = 2
                    idxHr = idx + 2
                    # "dans 1 heure 40", "à 1 heure 40"
                    if idxHr < len(words):
                        # "3 heures 45"
                        if words[idxHr].isdigit():
                            if wordPrev in words_in:
                                minOffset = int(words[idxHr])
                            else:
                                minAbs = int(words[idxHr])
                            used += 1
                            idxHr += 1
                        # "3 heures et quart", "4 heures et demi"
                        elif words[idxHr] == "et" and idxHr + 1 < len(words):
                            if words[idxHr + 1] == "quart":
                                if wordPrev in words_in:
                                    minOffset = 15
                                else:
                                    minAbs = 15
                                used += 2
                                idxHr += 2
                            elif words[idxHr + 1] == "demi":
                                if wordPrev in words_in:
                                    minOffset = 30
                                else:
                                    minAbs = 30
                                used += 2
                                idxHr += 2
                        # "5 heures moins 20", "6 heures moins le quart"
                        elif words[idxHr] == "moins" and \
                                idxHr + 1 < len(words):
                            if words[idxHr + 1].isdigit():
                                if wordPrev in words_in:
                                    hrOffset -= 1
                                    minOffset = 60 - int(words[idxHr + 1])
                                else:
                                    hrAbs = hrAbs - 1
                                    minAbs = 60 - int(words[idxHr + 1])
                                used += 2
                                idxHr += 2
                            elif words[idxHr + 1] == "quart":
                                if wordPrev in words_in:
                                    hrOffset -= 1
                                    minOffset = 45
                                else:
                                    hrAbs = hrAbs - 1
                                    minAbs = 45
                                used += 2
                                idxHr += 2
                        # remove word minutes if present
                        if idxHr < len(words) and \
                                words[idxHr] in ["minutes", "minute"]:
                            used += 1
                            idxHr += 1
                elif wordNext == "minutes":
                    # "dans 10 minutes"
                    if wordPrev in words_in:
                        minOffset = int(word)
                    else:
                        minAbs = int(word)
                    used = 2
                elif wordNext == "secondes":
                    # "dans 5 secondes"
                    secOffset = int(word)
                    used = 2
                elif int(word) > 100:
                    # format militaire
                    hrAbs = int(word) / 100
                    minAbs = int(word) - hrAbs * 100
                    used = 1
                    if wordNext == "heures":
                        used += 1

            # handle am/pm
            if timeQualifier:
                if timeQualifier == "matin":
                    ampm = "am"
                elif timeQualifier == "après-midi":
                    ampm = "pm"
                elif timeQualifier == "soir":
                    ampm = "pm"
                elif timeQualifier == "nuit":
                    if hrAbs > 8:
                        ampm = "pm"
                    else:
                        ampm = "am"
            hrAbs = hrAbs + 12 if ampm == "pm" and hrAbs < 12 else hrAbs
            hrAbs = hrAbs - 12 if ampm == "am" and hrAbs >= 12 else hrAbs
            if hrAbs > 24 or minAbs > 59:
                isTime = False
                used = 0
            elif wordPrev in words_in:
                isTime = False
            else:
                isTime = True

        elif hrAbs == 0 and timeQualifier:
            if timeQualifier == "matin":
                hrAbs = 8
            elif timeQualifier == "après-midi":
                hrAbs = 15
            elif timeQualifier == "soir":
                hrAbs = 19
            elif timeQualifier == "nuit":
                hrAbs = 2
            isTime = True

        if used > 0:
            # removed parsed words from the sentence
            for i in range(0, used):
                words[i + start] = ""

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""

            idx += used - 1
            found = True

    # check that we found a date
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation
    extractedDate = dateNow
    extractedDate = extractedDate.replace(microsecond=0,
                                          second=0,
                                          minute=0,
                                          hour=0)
    if datestr != "":
        if not hasYear:
            temp = datetime.strptime(datestr, "%B %d")
            temp = temp.replace(year=extractedDate.year)
            if extractedDate < temp:
                extractedDate = extractedDate.replace(year=int(currentYear),
                                                      month=int(
                                                          temp.strftime(
                                                              "%m")),
                                                      day=int(temp.strftime(
                                                          "%d")))
            else:
                extractedDate = extractedDate.replace(
                    year=int(currentYear) + 1,
                    month=int(temp.strftime("%m")),
                    day=int(temp.strftime("%d")))
        else:
            temp = datetime.strptime(datestr, "%B %d %Y")
            extractedDate = extractedDate.replace(
                year=int(temp.strftime("%Y")),
                month=int(temp.strftime("%m")),
                day=int(temp.strftime("%d")))

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)
    if hrAbs != -1 and minAbs != -1:

        extractedDate = extractedDate + relativedelta(hours=hrAbs,
                                                      minutes=minAbs)
        if (hrAbs != 0 or minAbs != 0) and datestr == "":
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)
    for idx, word in enumerate(words):
        if words[idx] == "et" and words[idx - 1] == "" and words[
                idx + 1] == "":
            words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    return [extractedDate, resultStr]


def isFractional_fr(input_str):
    """
    This function takes the given text and checks if it is a fraction.
    Args:
        input_str (str): the string to check if fractional
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
    """
    input_str = input_str.lower()

    if input_str != "tiers" and input_str.endswith('s', -1):
        input_str = input_str[:len(input_str) - 1]  # e.g. "quarts"

    aFrac = ["entier", "demi", "tiers", "quart", "cinquième", "sixième",
             "septième", "huitième", "neuvième", "dixième", "onzième",
             "douzième", "treizième", "quatorzième", "quinzième", "seizième",
             "dix-septième", "dix-huitième", "dix-neuvième", "vingtième"]

    if input_str in aFrac:
        return 1.0 / (aFrac.index(input_str) + 1)
    if getOrdinal_fr(input_str):
        return 1.0 / getOrdinal_fr(input_str)
    if input_str == "trentième":
        return 1.0 / 30
    if input_str == "centième":
        return 1.0 / 100
    if input_str == "millième":
        return 1.0 / 1000

    return False


def normalize_fr(text, remove_articles):
    """ French string normalization """
    text = text.lower()
    words = text.split()  # this also removed extra spaces
    normalized = ""
    i = 0
    while i < len(words):
        # remove articles
        if remove_articles and words[i] in articles_fr:
            i += 1
            continue
        if remove_articles and words[i][:2] in ["l'", "d'"]:
            words[i] = words[i][2:]
        # remove useless punctuation signs
        if words[i] in ["?", "!", ";", "…"]:
            i += 1
            continue
        # Normalize ordinal numbers
        if i > 0 and words[i - 1] in articles_fr:
            result = number_ordinal_fr(words, i)
            if result is not None:
                val, i = result
                normalized += " " + str(val)
                continue
        # Convert numbers into digits
        result = number_parse_fr(words, i)
        if result is not None:
            val, i = result
            normalized += " " + str(val)
            continue

        normalized += " " + words[i]
        i += 1

    return normalized[1:]  # strip the initial space