# -*- coding: utf-8 -*-
#
# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
    Parse functions for french (fr)

    Todo:
        * extractnumber_fr: ordinal numbers ("cinquième")
        * extractnumber_fr: numbers greater than 999 999 ("cinq millions")
        * extract_datetime_fr: "quatrième lundi de janvier"
        * get_gender_fr
"""

from datetime import datetime
from datetime import timedelta

from dateutil.relativedelta import relativedelta

from mycroft.util.lang.parse_common import is_numeric, look_for_fractions

# Undefined articles ["un", "une"] cannot be suppressed,
# in French, "un cheval" means "a horse" or "one horse".
articles_fr = ["le", "la", "du", "de", "les", "des"]

# Maps French number words to their integer value. Compound words that
# cannot be derived by simple addition ("soixante-dix", "quatre-vingt", ...)
# and regional variants ("septante", "huitante", ...) are listed explicitly.
numbers_fr = {
    "zéro": 0,
    "un": 1,
    "une": 1,
    "deux": 2,
    "trois": 3,
    "quatre": 4,
    "cinq": 5,
    "six": 6,
    "sept": 7,
    "huit": 8,
    "neuf": 9,
    "dix": 10,
    "onze": 11,
    "douze": 12,
    "treize": 13,
    "quatorze": 14,
    "quinze": 15,
    "seize": 16,
    "vingt": 20,
    "trente": 30,
    "quarante": 40,
    "cinquante": 50,
    "soixante": 60,
    "soixante-dix": 70,
    "septante": 70,
    "quatre-vingt": 80,
    "quatre-vingts": 80,
    "octante": 80,
    "huitante": 80,
    "quatre-vingt-dix": 90,
    "nonante": 90,
    "cent": 100,
    "cents": 100,
    "mille": 1000,
    "mil": 1000,
    "millier": 1000,
    "milliers": 1000,
    "million": 1000000,
    "millions": 1000000,
    "milliard": 1000000000,
    "milliards": 1000000000}

# Suffixes that turn a digit string into an ordinal ("1er", "2nde", "5ième").
# NOTE: a comma was missing between "nde" and "ième", which silently fused
# them into the useless suffix "ndeième".
ordinals_fr = ("er", "re", "ère", "nd", "nde", "ième", "ème", "e")


def number_parse_fr(words, i):
    """ Parses a list of words to find a number

    Takes in a list of words (strings without whitespace) and
    extracts a number that starts at the given index.

    Args:
        words (array): the list to extract a number from
        i (int): the index in words where to look for the number
    Returns:
        tuple with number, index of next word after the number.

        Returns None if no number was found.
    """

    def cte_fr(i, s):
        # Check if string s is equal to words[i].
        # If it is return tuple with s, index of next word.
        # If it is not return None.
        if i < len(words) and s == words[i]:
            return s, i + 1
        return None

    def number_word_fr(i, mi, ma):
        # Check if words[i] is a number in numbers_fr between mi and ma.
        # If it is return tuple with number, index of next word.
        # If it is not return None.
        if i < len(words):
            val = numbers_fr.get(words[i])
            # Numbers [1-16,20,30,40,50,60,70,80,90,100,1000]
            if val is not None:
                if mi <= val <= ma:
                    return val, i + 1
                return None

            # The number may be hyphenated (numbers [17-999])
            splitWord = words[i].split('-')
            if len(splitWord) > 1:
                val1 = numbers_fr.get(splitWord[0])
                if val1:
                    i1 = 0
                    val2 = 0
                    val3 = 0
                    # Hundreds, e.g. "deux-cents", "deux-cent-trois".
                    # "cent" must be handled here too, otherwise
                    # "deux-cent-trois" is summed as 2 + 100 + 3.
                    if val1 < 10 and splitWord[1] in ("cent", "cents"):
                        val1 = val1 * 100
                        i1 = 2

                    # For [81-99], e.g. "quatre-vingt-deux"
                    if len(splitWord) > i1 and splitWord[0] == "quatre" and \
                            splitWord[1] == "vingt":
                        val1 = 80
                        i1 += 2

                    # We still found a number
                    if i1 == 0:
                        i1 = 1

                    if len(splitWord) > i1:
                        # For [21,31,41,51,61,71]
                        if len(splitWord) > i1 + 1 and splitWord[i1] == "et":
                            val2 = numbers_fr.get(splitWord[i1 + 1])
                            if val2 is not None:
                                i1 += 2
                        # For [77-79],[97-99] e.g. "soixante-dix-sept"
                        elif splitWord[i1] == "dix" and \
                                len(splitWord) > i1 + 1:
                            val2 = numbers_fr.get(splitWord[i1 + 1])
                            if val2 is not None:
                                val2 += 10
                                i1 += 2
                        else:
                            val2 = numbers_fr.get(splitWord[i1])
                            if val2 is not None:
                                i1 += 1
                                if len(splitWord) > i1:
                                    val3 = numbers_fr.get(splitWord[i1])
                                    if val3 is not None:
                                        i1 += 1

                        if val2:
                            if val3:
                                val = val1 + val2 + val3
                            else:
                                val = val1 + val2
                        else:
                            return None
                    else:
                        # Every part was consumed by the prefix
                        # (e.g. "deux-cents"): the prefix is the number.
                        val = val1

                    if i1 == len(splitWord) and val and mi <= val <= ma:
                        return val, i + 1

        return None

    def number_1_99_fr(i):
        # Check if words[i] is a number between 1 and 99.
        # If it is return tuple with number, index of next word.
        # If it is not return None.

        # Is it a number between 1 and 16?
        result1 = number_word_fr(i, 1, 16)
        if result1:
            return result1

        # Is it a number between 10 and 99?
        result1 = number_word_fr(i, 10, 99)
        if result1:
            val1, i1 = result1
            result2 = cte_fr(i1, "et")
            # If the number is not hyphenated [21,31,41,51,61,71]
            if result2:
                i2 = result2[1]
                result3 = number_word_fr(i2, 1, 11)
                if result3:
                    val3, i3 = result3
                    return val1 + val3, i3
            return result1

        # It is not a number
        return None

    def number_1_999_fr(i):
        # Check if words[i] is a number between 1 and 999.
        # If it is return tuple with number, index of next word.
        # If it is not return None.

        # Is it 100 ?
        result = number_word_fr(i, 100, 100)

        # Is it [200,300,400,500,600,700,800,900]?
        if not result:
            resultH1 = number_word_fr(i, 2, 9)
            if resultH1:
                valH1, iH1 = resultH1
                resultH2 = number_word_fr(iH1, 100, 100)
                if resultH2:
                    iH2 = resultH2[1]
                    result = valH1 * 100, iH2

        if result:
            val1, i1 = result
            result2 = number_1_99_fr(i1)
            if result2:
                val2, i2 = result2
                return val1 + val2, i2
            return result

        # Is it hyphenated? [101-999]
        result = number_word_fr(i, 101, 999)
        if result:
            return result

        # [1-99]
        result = number_1_99_fr(i)
        if result:
            return result

        return None

    def number_1_999999_fr(i):
        """ Find a number in a list of words
        Checks if words[i] is a number between 1 and 999,999.

        Args:
            i (int): the index in words where to look for the number
        Returns:
            tuple with number, index of next word after the number.

            Returns None if no number was found.
        """

        # check for zero
        result1 = number_word_fr(i, 0, 0)
        if result1:
            return result1

        # check for [1-999]
        result1 = number_1_999_fr(i)
        if result1:
            val1, i1 = result1
        else:
            # a bare "mille" means one thousand
            val1 = 1
            i1 = i
        # check for 1000
        result2 = number_word_fr(i1, 1000, 1000)
        if result2:
            # it's [1000-999000]
            i2 = result2[1]
            # check again for [1-999]
            result3 = number_1_999_fr(i2)
            if result3:
                val3, i3 = result3
                return val1 * 1000 + val3, i3
            return val1 * 1000, i2
        elif result1:
            return result1
        return None

    return number_1_999999_fr(i)


def getOrdinal_fr(word):
    """ Get the ordinal number

    Takes in a word (string without whitespace) and
    extracts the ordinal number.

    Args:
        word (string): the word to extract the number from
    Returns:
        number (int)

        Returns None if no ordinal number was found.
    """
    if word and word[0].isdigit():
        for ordinal in ordinals_fr:
            # The marker must be a real suffix: a substring test would
            # turn e.g. "1e5" into 15.
            if word.endswith(ordinal):
                result = word[:-len(ordinal)]
                if result.isdigit():
                    return int(result)

    return None


def number_ordinal_fr(words, i):
    """ Find an ordinal number in a list of words

    Takes in a list of words (strings without whitespace) and
    extracts an ordinal number that starts at the given index.

    Args:
        words (array): the list to extract a number from
        i (int): the index in words where to look for the ordinal number
    Returns:
        tuple with ordinal number (str),
        index of next word after the number (int).

        Returns None if no ordinal number was found.
    """
    if i >= len(words):
        return None

    strOrd = ""
    # it's already a digit, normalize to "1er" or "5e"
    val1 = getOrdinal_fr(words[i])
    if val1 is not None:
        if val1 == 1:
            strOrd = "1er"
        else:
            strOrd = str(val1) + "e"
        return strOrd, i + 1

    # if it's a big number the beginning should be detected as a number
    result = number_parse_fr(words, i)
    if result:
        val1, i = result
    else:
        val1 = 0

    if i < len(words):
        word = words[i]
        if word in ["premier", "première"]:
            strOrd = "1er"
        elif word == "second":
            strOrd = "2e"
        elif word.endswith("ième"):
            val2 = None
            word = word[:-4]
            # centième
            if word == "cent":
                if val1:
                    strOrd = str(val1 * 100) + "e"
                else:
                    strOrd = "100e"
            # millième
            elif word == "mill":
                if val1:
                    strOrd = str(val1 * 1000) + "e"
                else:
                    strOrd = "1000e"
            else:
                # "cinquième", "trente-cinquième"
                if word.endswith("cinqu"):
                    word = word[:-1]
                # "neuvième", "dix-neuvième"
                elif word.endswith("neuv"):
                    word = word[:-1] + "f"
                result = number_parse_fr([word], 0)
                if not result:
                    # "trentième", "douzième"
                    word = word + "e"
                    result = number_parse_fr([word], 0)
                if result:
                    # Only the value is of interest: the index returned
                    # here refers to the one-word list, not to `words`,
                    # and must not overwrite `i`.
                    val2 = result[0]
                if val2 is not None:
                    strOrd = str(val1 + val2) + "e"
        if strOrd:
            return strOrd, i + 1

    return None


def extractnumber_fr(text):
    """Takes in a string and extracts a number.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or (str): The number extracted, or the normalized
        text (articles removed) when no non-zero number was found.
    """
    # normalize text, keep articles for ordinals versus fractionals
    text = normalize_fr(text, False)
    # split words by whitespace
    aWords = text.split()
    count = 0
    result = None
    add = False
    while count < len(aWords):
        val = None
        word = aWords[count]
        wordNext = ""
        wordPrev = ""
        if count < (len(aWords) - 1):
            wordNext = aWords[count + 1]
        if count > 0:
            wordPrev = aWords[count - 1]

        if word in articles_fr:
            count += 1
            continue
        if word in ["et", "plus", "+"]:
            count += 1
            add = True
            continue

        # is current word a numeric number?
        if word.isdigit():
            val = int(word)
            count += 1
        elif is_numeric(word):
            val = float(word)
            count += 1
        elif wordPrev in articles_fr and getOrdinal_fr(word):
            val = getOrdinal_fr(word)
            count += 1
        # is current word the denominator of a fraction?
        elif isFractional_fr(word):
            val = isFractional_fr(word)
            count += 1

        # is current word the numerator of a fraction?
        if val and wordNext:
            valNext = isFractional_fr(wordNext)
            if valNext:
                val = float(val) * valNext
                count += 1

        # Nothing matched so far. Test against None, not falsiness: a
        # literal 0 has already been consumed above and must not advance
        # the cursor a second time.
        if val is None:
            count += 1
            # is current word a numeric fraction like "2/3"?
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        # is current word followed by a decimal value?
        if wordNext == "virgule":
            zeros = 0
            newWords = aWords[count + 1:]
            # count the number of zeros after the decimal sign
            for decimalWord in newWords:
                if decimalWord == "zéro" or decimalWord == "0":
                    zeros += 1
                else:
                    break
            afterDotVal = None
            # extract the number after the zeros ("virgule" may be the
            # last word, or be followed by zeros only)
            if zeros < len(newWords) and newWords[zeros].isdigit():
                afterDotVal = newWords[zeros]
                countDot = count + zeros + 2
            # if a number was extracted (since comma is also a
            # punctuation sign)
            if afterDotVal:
                count = countDot
                if not val:
                    val = 0
                # add the zeros
                afterDotString = zeros * "0" + afterDotVal
                val = float(str(val) + "." + afterDotString)
        if val:
            if add and result is not None:
                result += val
            else:
                # also covers a leading "et"/"plus" with nothing to add to
                result = val
            add = False

    if not result:
        return normalize_fr(text, True)

    return result


def extract_datetime_fr(string, currentDate=None):
    """ Extract a date and/or time from a French sentence.

    Args:
        string (str): the sentence to parse
        currentDate (datetime): the reference date for relative
            expressions; defaults to datetime.now()
    Returns:
        [datetime, str]: the extracted date and the normalized sentence
        with the date/time words removed.

        Returns None if the string is empty or no date/time was found.
    """
    def clean_string(s):
        """
            cleans the input string of unneeded punctuation and capitalization
            among other things.
        """
        s = normalize_fr(s, True)
        wordList = s.split()
        for idx, word in enumerate(wordList):
            # remove comma and dot if it's not a number
            if word[-1] in [",", "."]:
                word = word[:-1]
            wordList[idx] = word

        return wordList

    def date_found():
        return found or \
            (
                datestr != "" or
                yearOffset != 0 or monthOffset != 0 or dayOffset or
                (isTime and (hrAbs != 0 or minAbs != 0)) or
                hrOffset != 0 or minOffset != 0 or secOffset != 0
            )

    if string == "":
        return None
    if currentDate is None:
        currentDate = datetime.now()

    found = False
    daySpecified = False
    # False (rather than 0) means "no day offset seen yet"
    dayOffset = False
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = dateNow.strftime("%w")
    currentYear = dateNow.strftime("%Y")
    fromFlag = False
    # "<month number> <day>[ <year>]" once a calendar date was found
    datestr = ""
    hasYear = False
    timeQualifier = ""

    timeQualifiersList = ["matin", "après-midi", "soir", "nuit"]
    words_in = ["dans", "après"]
    markers = ["à", "dès", "autour", "vers", "environs", "ce", "cette"] + \
        words_in
    days = ["lundi", "mardi", "mercredi",
            "jeudi", "vendredi", "samedi", "dimanche"]
    months = ["janvier", "février", "mars", "avril", "mai", "juin",
              "juillet", "août", "septembre", "octobre", "novembre",
              "décembre"]
    monthsShort = ["jan", "fév", "mar", "avr", "mai", "juin", "juil", "aoû",
                   "sept", "oct", "nov", "déc"]
    # words that may follow "après"/"depuis" (loop invariant)
    validFollowups = days + months + monthsShort + [
        "aujourd'hui", "demain",
        "prochain", "prochaine", "suivant", "suivante",
        "dernier", "dernière", "précédent", "précédente",
        "maintenant"]

    words = clean_string(string)

    # parse date
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        start = idx
        used = 0
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word
            used = 1
            if wordPrev in ["ce", "cet", "cette"]:
                used = 2
                start -= 1
        # parse aujourd'hui, demain, après-demain
        elif word == "aujourd'hui" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "demain" and not fromFlag:
            dayOffset = 1
            used += 1
        elif word == "après-demain" and not fromFlag:
            dayOffset = 2
            used += 1
        # parse 5 jours, 10 semaines, semaine dernière, semaine prochaine
        elif word in ["jour", "jours"]:
            if wordPrev.isdigit():
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
            # "3e jour"
            elif getOrdinal_fr(wordPrev) is not None:
                dayOffset += getOrdinal_fr(wordPrev) - 1
                start -= 1
                used = 2
        elif word in ["semaine", "semaines"] and not fromFlag:
            # isdigit() is safe on an empty wordPrev (first word of the
            # sentence) and on tokens int() cannot convert
            if wordPrev.isdigit():
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordNext in ["prochaine", "suivante"]:
                dayOffset = 7
                used = 2
            elif wordNext in ["dernière", "précédente"]:
                dayOffset = -7
                used = 2
        # parse 10 mois, mois prochain, mois dernier
        elif word == "mois" and not fromFlag:
            if wordPrev.isdigit():
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordNext in ["prochain", "suivant"]:
                monthOffset = 1
                used = 2
            elif wordNext in ["dernier", "précédent"]:
                monthOffset = -1
                used = 2
        # parse 5 ans, an prochain, année dernière
        elif word in ["an", "ans", "année", "années"] and not fromFlag:
            if wordPrev.isdigit():
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordNext in ["prochain", "prochaine",
                              "suivant", "suivante"]:
                yearOffset = 1
                used = 2
            elif wordNext in ["dernier", "dernière",
                              "précédent", "précédente"]:
                yearOffset = -1
                used = 2
        # parse lundi, mardi etc., and lundi prochain, mardi dernier, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordNext in ["prochain", "suivant"]:
                dayOffset += 7
                used += 1
            elif wordNext in ["dernier", "précédent"]:
                dayOffset -= 7
                used += 1
        # parse 15 juillet, 15 juil
        # (parenthesized: "not fromFlag" applies to both month lists)
        elif (word in months or word in monthsShort) and not fromFlag:
            if word in months:
                m = months.index(word)
            else:
                m = monthsShort.index(word)
            used += 1
            # The month is stored as a number so that parsing it back
            # does not depend on the process locale.
            datestr = str(m + 1)
            if wordPrev and wordPrev[0].isdigit():
                # "1er juillet": use the numeric value of the ordinal
                dayOrd = getOrdinal_fr(wordPrev)
                datestr += " " + (wordPrev if dayOrd is None
                                  else str(dayOrd))
                start -= 1
                used += 1
            else:
                datestr += " 1"
            if wordNext.isdigit():
                datestr += " " + wordNext
                used += 1
                hasYear = True
            else:
                hasYear = False
        # parse 5 jours après demain, 10 semaines après jeudi prochain,
        # 2 mois après juillet
        if word in ["après", "depuis"] and wordNext in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "demain":
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                if wordNextNext == "prochain":
                    tmpOffset += 7
                    used += 1
                elif wordNextNext == "dernier":
                    tmpOffset -= 7
                    used += 1
                elif tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
        if used > 0:
            if start - 1 > 0 and words[start - 1] in ["ce", "cette"]:
                start -= 1
                used += 1

            # blank out the parsed words
            for i in range(0, used):
                words[i + start] = ""

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # parse time
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = 0
    minAbs = 0
    ampm = ""
    isTime = False

    for idx, word in enumerate(words):
        if word == "":
            continue

        # computed here for the current position; it must not be left
        # over from the date loop above
        wordPrevPrevPrev = words[idx - 3] if idx > 2 else ""
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        used = 0
        start = idx

        # parse midi et quart, minuit et demi, midi 10, minuit moins 20
        if word in ["midi", "minuit"]:
            isTime = True
            if word == "midi":
                hrAbs = 12
                used += 1
            elif word == "minuit":
                hrAbs = 0
                used += 1
            if wordNext.isdigit():
                minAbs = int(wordNext)
                used += 1
            elif wordNext == "et":
                if wordNextNext == "quart":
                    minAbs = 15
                    used += 2
                elif wordNextNext == "demi":
                    minAbs = 30
                    used += 2
            elif wordNext == "moins":
                if wordNextNext.isdigit():
                    minAbs = 60 - int(wordNextNext)
                    if hrAbs == 0:
                        hrAbs = 23
                    else:
                        hrAbs -= 1
                    used += 2
                elif wordNextNext == "quart":
                    minAbs = 45
                    if hrAbs == 0:
                        hrAbs = 23
                    else:
                        hrAbs -= 1
                    used += 2
        # parse une demi-heure, un quart d'heure
        elif word == "demi-heure" or \
                (word == "heure" and
                 (wordPrevPrev in markers or wordPrevPrevPrev in markers)):
            used = 1
            isTime = True
            if word == "demi-heure":
                minOffset = 30
            elif wordPrev == "quart":
                minOffset = 15
                used += 1
                start -= 1
            elif wordPrev == "quarts" and wordPrevPrev.isdigit():
                minOffset = int(wordPrevPrev) * 15
                used += 1
                start -= 1
            if wordPrev.isdigit() or wordPrevPrev.isdigit():
                start -= 1
                used += 1
        # parse 5:00 du matin, 12:00, etc
        elif word[0].isdigit() and getOrdinal_fr(word) is None:
            isTime = True
            if ":" in word or "h" in word or "min" in word:
                # parse hours on short format
                # "3:00 du matin", "4h14", "3h15min"
                strHH = ""
                strMM = ""
                # 0: reading hours, 1: reading minutes, 2: done
                stage = 0
                for char in word:
                    if stage == 0:
                        if char.isdigit():
                            strHH += char
                            used = 1
                        elif char in [":", "h", "m"]:
                            stage = 1
                        else:
                            stage = 2
                    elif stage == 1:
                        if char.isdigit():
                            strMM += char
                            used = 1
                        else:
                            stage = 2
                    else:
                        break
                if wordPrev in words_in:
                    hrOffset = int(strHH) if strHH else 0
                    minOffset = int(strMM) if strMM else 0
                else:
                    hrAbs = int(strHH) if strHH else 0
                    minAbs = int(strMM) if strMM else 0
            else:
                # try to parse time without colons
                # 5 hours, 10 minutes etc.
                ampm = ""
                if (
                        word.isdigit() and
                        wordNext in ["heures", "heure"] and word != "0" and
                        (
                            int(word) < 100 or
                            int(word) > 2400
                        )):
                    # "dans 3 heures", "à 3 heures"
                    if wordPrev in words_in:
                        hrOffset = int(word)
                    else:
                        hrAbs = int(word)
                    used = 2
                    idxHr = idx + 2
                    # "dans 1 heure 40", "à 1 heure 40"
                    if idxHr < len(words):
                        # "3 heures 45"
                        if words[idxHr].isdigit():
                            if wordPrev in words_in:
                                minOffset = int(words[idxHr])
                            else:
                                minAbs = int(words[idxHr])
                            used += 1
                            idxHr += 1
                        # "3 heures et quart", "4 heures et demi"
                        elif words[idxHr] == "et" and idxHr + 1 < len(words):
                            if words[idxHr + 1] == "quart":
                                if wordPrev in words_in:
                                    minOffset = 15
                                else:
                                    minAbs = 15
                                used += 2
                                idxHr += 2
                            elif words[idxHr + 1] == "demi":
                                if wordPrev in words_in:
                                    minOffset = 30
                                else:
                                    minAbs = 30
                                used += 2
                                idxHr += 2
                        # "5 heures moins 20", "6 heures moins le quart"
                        elif words[idxHr] == "moins" and \
                                idxHr + 1 < len(words):
                            if words[idxHr + 1].isdigit():
                                if wordPrev in words_in:
                                    hrOffset -= 1
                                    minOffset = 60 - int(words[idxHr + 1])
                                else:
                                    hrAbs = hrAbs - 1
                                    minAbs = 60 - int(words[idxHr + 1])
                                used += 2
                                idxHr += 2
                            elif words[idxHr + 1] == "quart":
                                if wordPrev in words_in:
                                    hrOffset -= 1
                                    minOffset = 45
                                else:
                                    hrAbs = hrAbs - 1
                                    minAbs = 45
                                used += 2
                                idxHr += 2
                        # remove word minutes if present
                        if idxHr < len(words) and \
                                words[idxHr] in ["minutes", "minute"]:
                            used += 1
                            idxHr += 1
                # The isdigit() guards below keep tokens such as "3.14"
                # from reaching int().
                elif word.isdigit() and wordNext == "minutes":
                    # "dans 10 minutes"
                    if wordPrev in words_in:
                        minOffset = int(word)
                    else:
                        minAbs = int(word)
                    used = 2
                elif word.isdigit() and wordNext == "secondes":
                    # "dans 5 secondes"
                    secOffset = int(word)
                    used = 2
                elif word.isdigit() and int(word) > 100:
                    # military format; floor division keeps the hour an
                    # integer ("1530" -> 15 h 30)
                    hrAbs = int(word) // 100
                    minAbs = int(word) - hrAbs * 100
                    used = 1
                    if wordNext == "heures":
                        used += 1

            # handle am/pm
            if timeQualifier:
                if timeQualifier == "matin":
                    ampm = "am"
                elif timeQualifier == "après-midi":
                    ampm = "pm"
                elif timeQualifier == "soir":
                    ampm = "pm"
                elif timeQualifier == "nuit":
                    if hrAbs > 8:
                        ampm = "pm"
                    else:
                        ampm = "am"
            hrAbs = hrAbs + 12 if ampm == "pm" and hrAbs < 12 else hrAbs
            hrAbs = hrAbs - 12 if ampm == "am" and hrAbs >= 12 else hrAbs
            if hrAbs > 24 or minAbs > 59:
                isTime = False
                used = 0
            elif wordPrev in words_in:
                isTime = False
            else:
                isTime = True

        elif hrAbs == 0 and timeQualifier:
            # no explicit hour: fall back to a default per time of day
            if timeQualifier == "matin":
                hrAbs = 8
            elif timeQualifier == "après-midi":
                hrAbs = 15
            elif timeQualifier == "soir":
                hrAbs = 19
            elif timeQualifier == "nuit":
                hrAbs = 2
            isTime = True

        if used > 0:
            # removed parsed words from the sentence
            for i in range(0, used):
                words[i + start] = ""

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""

            found = True

    # check that we found a date
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation
    extractedDate = dateNow
    extractedDate = extractedDate.replace(microsecond=0,
                                          second=0,
                                          minute=0,
                                          hour=0)
    if datestr != "":
        if not hasYear:
            # Parse with an explicit year: without one strptime assumes
            # 1900 and rejects "2 29" even during a leap year.
            temp = datetime.strptime(datestr + " " + currentYear,
                                     "%m %d %Y")
            if extractedDate < temp:
                extractedDate = extractedDate.replace(
                    year=int(currentYear),
                    month=temp.month,
                    day=temp.day)
            else:
                # that day has already passed this year
                extractedDate = extractedDate.replace(
                    year=int(currentYear) + 1,
                    month=temp.month,
                    day=temp.day)
        else:
            temp = datetime.strptime(datestr, "%m %d %Y")
            extractedDate = extractedDate.replace(
                year=temp.year,
                month=temp.month,
                day=temp.day)

    # calendar-aware offsets need relativedelta, fixed-length ones do not
    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + timedelta(days=dayOffset)

    if hrAbs != -1 and minAbs != -1:
        extractedDate = extractedDate + timedelta(hours=hrAbs,
                                                  minutes=minAbs)
        if (hrAbs != 0 or minAbs != 0) and datestr == "":
            # a bare time that is already past means "tomorrow"
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + timedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + timedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + timedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + timedelta(seconds=secOffset)

    # drop an "et" left alone between removed words; a missing neighbour
    # at either end of the sentence counts as removed
    for idx, word in enumerate(words):
        if word == "et":
            before = words[idx - 1] if idx > 0 else ""
            after = words[idx + 1] if idx + 1 < len(words) else ""
            if before == "" and after == "":
                words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    return [extractedDate, resultStr]


def isFractional_fr(input_str):
    """
    This function takes the given text and checks if it is a fraction.

    Args:
        input_str (str): the string to check if fractional
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
    """
    input_str = input_str.lower()

    # strip the plural mark, e.g. "quarts"; "tiers" always ends with "s"
    if input_str != "tiers" and input_str.endswith('s'):
        input_str = input_str[:-1]

    # the position in the list gives the denominator (index + 1)
    aFrac = ["entier", "demi", "tiers", "quart", "cinquième", "sixième",
             "septième", "huitième", "neuvième", "dixième", "onzième",
             "douzième", "treizième", "quatorzième", "quinzième", "seizième",
             "dix-septième", "dix-huitième", "dix-neuvième", "vingtième"]

    if input_str in aFrac:
        return 1.0 / (aFrac.index(input_str) + 1)

    # numeric ordinals such as "5e"; 0 is excluded by the truth test
    denominator = getOrdinal_fr(input_str)
    if denominator:
        return 1.0 / denominator

    if input_str == "trentième":
        return 1.0 / 30
    if input_str == "centième":
        return 1.0 / 100
    if input_str == "millième":
        return 1.0 / 1000

    return False


def normalize_fr(text, remove_articles):
    """ French string normalization

    Lower-cases the text, drops isolated punctuation signs, converts
    number words to digits and ordinals that follow an article to their
    short form ("le cinquième" -> "le 5e").

    Args:
        text (str): the text to normalize
        remove_articles (bool): whether to remove the articles listed in
            articles_fr and the elided "l'" / "d'" prefixes
    Returns:
        (str): the normalized text
    """
    text = text.lower()
    words = text.split()  # this also removed extra spaces
    normalized = []
    i = 0
    while i < len(words):
        # remove articles
        if remove_articles and words[i] in articles_fr:
            i += 1
            continue
        if remove_articles and words[i][:2] in ["l'", "d'"]:
            words[i] = words[i][2:]
        # remove useless punctuation signs
        if words[i] in ["?", "!", ";", "…"]:
            i += 1
            continue
        # Normalize ordinal numbers
        if i > 0 and words[i - 1] in articles_fr:
            result = number_ordinal_fr(words, i)
            if result is not None:
                val, i = result
                normalized.append(str(val))
                continue
        # Convert numbers into digits
        result = number_parse_fr(words, i)
        if result is not None:
            val, i = result
            normalized.append(str(val))
            continue

        normalized.append(words[i])
        i += 1

    return " ".join(normalized)