def extractnumber(text, lang="en-us"):
    """Takes in a string and extracts a number.

    Dispatches on the prefix of the lower-cased language code.

    Args:
        text (str): the string to extract a number from
        lang (str): the code for the language text is in
    Returns:
        The value returned by the language-specific extractor for "en" and
        "pt" language codes; the original text, unchanged, for any other
        language.
    """
    lang_lower = str(lang).lower()
    if lang_lower.startswith("en"):
        return extractnumber_en(text)
    if lang_lower.startswith("pt"):
        return extractnumber_pt(text)

    # TODO: number extraction for other languages
    return text


def extract_datetime(text, anchorDate=None, lang="en-us"):
    """Extract date and time information from a sentence.

    Dispatches on the prefix of the lower-cased language code to the
    language-specific parser. The English parser handles relative dates
    such as "5 days from today" and gives vague day periods fixed hours:
        - morning = 8 AM
        - afternoon = 3 PM
        - evening = 7 PM

    Args:
        text (str): the sentence to parse
        anchorDate (:obj:`datetime`, optional): the date to be used for
            relative dating (for example, what does "tomorrow" mean?).
            Passed through unchanged to the language-specific parser.
        lang (str): the language of the sentence(s)

    Returns:
        [:obj:`datetime`, :obj:`str`]: whatever the language-specific
            parser returns: the extracted datetime and the phrase with the
            date and time related keywords stripped out.
            None when the language is not supported (previously the raw
            text was returned here, which no caller could tell apart from
            a successful result).

    Examples:

        >>> extract_datetime(
        ...    "What is the weather like the day after tomorrow?",
        ...    datetime(2017, 6, 30, 0, 0)
        ... )
        [datetime.datetime(2017, 7, 2, 0, 0), 'what is weather like']

        >>> extract_datetime(
        ...    "Set up an appointment 2 weeks from Sunday at 5 pm",
        ...    datetime(2016, 2, 19, 0, 0)
        ... )
        [datetime.datetime(2016, 3, 6, 17, 0), 'set up appointment']
    """
    lang_lower = str(lang).lower()
    if lang_lower.startswith("en"):
        return extract_datetime_en(text, anchorDate)
    if lang_lower.startswith("pt"):
        return extract_datetime_pt(text, anchorDate)

    # No parser for this language: report "nothing extracted".
    return None
def is_numeric(input_str):
    """
    Takes in a string and tests to see if it is a number.

    Args:
        input_str (str): string to test if a number
    Returns:
        (bool): True if float() accepts the value and the result is a
            finite number, else False. None and other non-convertible
            objects give False instead of raising, and the words "nan" /
            "inf" (which float() accepts) are not treated as numbers.
    """
    try:
        value = float(input_str)
    except (TypeError, ValueError):
        return False
    # nan is the only value not equal to itself; +/-inf are the only values
    # whose magnitude equals infinity.
    return value == value and abs(value) != float("inf")


def extractnumber_en(text):
    """
    Extract the first number expressed in an English phrase.

    Understands digits ("2", "1.5"), "first"/"second", the words "one" to
    "ten", fraction words (via isFractional_en), "n/m" fractions (via
    look_for_fractions), a number word followed by a fraction word
    ("three quarters") and "<number> and <number>" sums
    ("1 and 3/4", "one cup and a half"). The articles "the", "a" and "an"
    are ignored.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float): The value of the extracted number, or False when
            no number was found. A value of zero is indistinguishable from
            "not found" and also yields False.
    """
    number_words = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
                    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10}
    words = [word for word in text.split()
             if word not in ("the", "a", "an")]

    and_pass = False     # True once "<number> ... and" has been seen
    val_pre_and = False  # the number found before that "and"
    val = False
    count = 0
    while count < len(words):
        word = words[count]
        if is_numeric(word):
            val = float(word)
        elif word == "first":
            val = 1
        elif word == "second":
            val = 2
        else:
            fraction = isFractional_en(word)
            if fraction:
                val = fraction
            else:
                val = number_words.get(word, False)
                if val:
                    # "three quarters" -> 3 * 1/4
                    if count + 1 < len(words):
                        next_fraction = isFractional_en(words[count + 1])
                        if next_fraction:
                            val = val * next_fraction
                            words[count + 1] = ""

        if not val:
            # look for fractions like "2/3"
            pieces = word.split('/')
            if look_for_fractions(pieces):
                val = float(pieces[0]) / float(pieces[1])
            elif and_pass:
                # nothing numeric follows the "and": keep the first number
                val = val_pre_and
                break
            else:
                count += 1
                continue

        words[count] = ""

        if and_pass:
            words[count - 1] = ''  # remove "and"
            val += val_pre_and
        elif count + 1 < len(words) and words[count + 1] == 'and':
            and_pass = True
            val_pre_and = val
            val = False
            count += 2
            continue
        elif count + 2 < len(words) and words[count + 2] == 'and':
            and_pass = True
            val_pre_and = val
            val = False
            count += 3
            continue

        break

    if not val and and_pass:
        # The text ended right after "and" ("2 and"): the number found
        # before it must not be lost.
        val = val_pre_and

    if not val:
        return False
    return val
def extract_datetime_en(str, currentDate=None):
    """Extract a date and time from an English sentence.

    The sentence is scanned twice. The first pass consumes date
    expressions ("today", "tomorrow", "2 weeks", "next friday", "july 4",
    "5 days from monday"); the second pass consumes time expressions
    ("noon", "5 pm", "3:30", "in 10 minutes", "0800 hours"). Consumed words
    are blanked so the remaining words can be returned as leftover text.

    Absolute times are applied on top of midnight of the resolved day.
    Purely relative times ("in 3 hours", "in 10 minutes") are applied on
    top of the anchor's own time of day.

    Args:
        str (str): the sentence to parse
        currentDate (:obj:`datetime`, optional): anchor used to resolve
            relative expressions. Defaults to datetime.now().

    Returns:
        [:obj:`datetime`, :obj:`str`]: the extracted datetime and the
            lower-cased sentence with date/time words (and articles and
            punctuation) removed, or None when the sentence is empty or no
            date/time expression was recognised.

    Raises:
        ValueError: from datetime when a month/day pair does not exist
            (for example "february 30").
    """

    def clean_string(str):
        # Lower-case, drop punctuation and articles, then strip possessive
        # "'s" and ordinal suffixes ("20th" -> "20") word by word.
        str = str.lower().replace('?', '').replace('.', '').replace(',', '') \
            .replace(' the ', ' ').replace(' a ', ' ').replace(' an ', ' ')
        wordList = str.split()
        for idx, word in enumerate(wordList):
            word = word.replace("'s", "")
            # [:1] instead of [0]: the word may be empty after stripping
            if word[:1].isdigit():
                for ordinal in ("rd", "st", "nd", "th"):
                    word = word.replace(ordinal, "")
            wordList[idx] = word
        return wordList

    def date_found():
        # True when either pass recognised a date/time expression.
        return found or (
            datestr != "" or
            yearOffset != 0 or monthOffset != 0 or dayOffset != 0 or
            hrOffset != 0 or hrAbs != 0 or
            minOffset != 0 or minAbs != 0 or
            secOffset != 0)

    def is_day(token):
        # day-of-month candidate such as "4" or "20"
        return token.isdigit() and 1 <= int(token) <= 31

    def is_year(token):
        # strptime's %Y only accepts four-digit years
        return len(token) == 4 and token.isdigit()

    if str == "":
        return None
    if currentDate is None:
        currentDate = datetime.now()

    found = False
    daySpecified = False
    dayOffset = 0
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = int(dateNow.strftime("%w"))  # 0 = Sunday ... 6 = Saturday
    currentYear = dateNow.year
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    timeQualifiersList = ['morning', 'afternoon', 'evening']
    markers = ['at', 'in', 'on', 'by', 'this', 'around', 'for', 'of']
    days = ['monday', 'tuesday', 'wednesday',
            'thursday', 'friday', 'saturday', 'sunday']
    months = ['january', 'february', 'march', 'april', 'may', 'june',
              'july', 'august', 'september', 'october', 'november',
              'december']
    monthsShort = ['jan', 'feb', 'mar', 'apr', 'may', 'june', 'july', 'aug',
                   'sept', 'oct', 'nov', 'dec']
    # words that may follow "from" / "after" ("5 days from tomorrow")
    validFollowups = days + months + monthsShort + [
        "today", "tomorrow", "next", "last", "now"]

    words = clean_string(str)

    # ------------------------------------------------------------------
    # pass 1: dates
    # ------------------------------------------------------------------
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        # plural is stripped on the local copy only, not saved to `words`
        word = word.rstrip('s')
        start = idx
        used = 0
        if word in timeQualifiersList:
            # save timequalifier for later
            timeQualifier = word
        # parse today, tomorrow, day after tomorrow
        elif word == "today" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "tomorrow" and not fromFlag:
            dayOffset = 1
            used += 1
        elif (word == "day" and wordNext == "after" and
                wordNextNext == "tomorrow" and not fromFlag and
                not wordPrev[:1].isdigit()):
            dayOffset = 2
            used = 3
            if wordPrev == "the":
                start -= 1
                used += 1
        # parse 5 days, 10 weeks, last week, next week
        elif word == "day":
            if wordPrev.isdigit():
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
        elif word == "week" and not fromFlag:
            if wordPrev.isdigit():
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordPrev == "next":
                dayOffset = 7
                start -= 1
                used = 2
            elif wordPrev == "last":
                dayOffset = -7
                start -= 1
                used = 2
        # parse 10 months, next month, last month
        elif word == "month" and not fromFlag:
            if wordPrev.isdigit():
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "next":
                monthOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "last":
                monthOffset = -1
                start -= 1
                used = 2
        # parse 5 years, next year, last year
        elif word == "year" and not fromFlag:
            if wordPrev.isdigit():
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "next":
                yearOffset = 1
                start -= 1
                used = 2
            elif wordPrev == "last":
                yearOffset = -1
                start -= 1
                used = 2
        # parse Monday, Tuesday, etc., and next Monday, last Tuesday, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - today
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordPrev == "next":
                dayOffset += 7
                used += 1
                start -= 1
            elif wordPrev == "last":
                dayOffset -= 7
                used += 1
                start -= 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        # (parentheses spell out the precedence the original relied on:
        # full month names are accepted even after "from")
        elif word in months or (word in monthsShort and not fromFlag):
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            # A month name only counts as a date together with a day
            # number; a bare "may" or "march" is left in the sentence.
            if is_day(wordPrev) or \
                    (wordPrev == "of" and is_day(wordPrevPrev)):
                if wordPrev == "of":
                    datestr = months[m] + " " + wordPrevPrev
                    used = 3
                    start -= 2
                else:
                    datestr = months[m] + " " + wordPrev
                    used = 2
                    start -= 1
                hasYear = is_year(wordNext)
                if hasYear:
                    datestr += " " + wordNext
                    used += 1
            elif is_day(wordNext):
                datestr = months[m] + " " + wordNext
                used = 2
                hasYear = is_year(wordNextNext)
                if hasYear:
                    datestr += " " + wordNextNext
                    used += 1

        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        if (word == "from" or word == "after") and wordNext in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "tomorrow":
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - today
                used = 2
                if tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - today
                # "from" + "next"/"last" + weekday: exactly three words
                used = 3
                if wordNext == "next":
                    tmpOffset += 7
                elif wordNext == "last":
                    tmpOffset -= 7
                dayOffset += tmpOffset

        if used > 0:
            if start - 1 > 0 and words[start - 1] == "this":
                start -= 1
                used += 1

            for i in range(0, used):
                words[i + start] = ""

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ------------------------------------------------------------------
    # pass 2: times
    # ------------------------------------------------------------------
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = 0    # -1 marks "relative time only, no clock time given"
    minAbs = 0
    dayPeriods = {"morning": "am", "afternoon": "pm", "evening": "pm"}

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""

        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word == "noon":
            hrAbs = 12
            used += 1
        elif word == "midnight":
            hrAbs = 0
            used += 1
        elif word == "morning":
            if hrAbs == 0:
                hrAbs = 8
            used += 1
        elif word == "afternoon":
            if hrAbs == 0:
                hrAbs = 15
            used += 1
        elif word == "evening":
            if hrAbs == 0:
                hrAbs = 19
            used += 1
        # parse half an hour, quarter hour
        elif word == "hour" and \
                (wordPrev in markers or wordPrevPrev in markers):
            if wordPrev == "half":
                minOffset = 30
            elif wordPrev == "quarter":
                minOffset = 15
            elif wordPrevPrev == "quarter":
                minOffset = 15
                if idx > 2 and words[idx - 3] in markers:
                    words[idx - 3] = ""
                words[idx - 2] = ""
            else:
                hrOffset = 1
            if wordPrevPrev in markers:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # "3:00", "5:30pm", "3:00 in the morning"
                pos = 0
                while pos < len(word) and word[pos].isdigit():
                    strHH += word[pos]
                    pos += 1
                if pos < len(word) and word[pos] == ":":
                    pos += 1
                    while pos < len(word) and word[pos].isdigit():
                        strMM += word[pos]
                        pos += 1
                # everything after the minutes, e.g. the "pm" of "5:30pm"
                remainder = word[pos:].replace(".", "")

                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif (wordNext == "in" and wordNextNext == "the" and
                            wordNextNextNext in dayPeriods):
                        remainder = dayPeriods[wordNextNextNext]
                        used += 3
                    elif (wordNext in ("in", "this") and
                            wordNextNext in dayPeriods):
                        remainder = dayPeriods[wordNextNext]
                        used += 2
                    elif wordNext == "at" and wordNextNext == "night":
                        remainder = "pm" if int(strHH) > 5 else "am"
                        used += 2
                    elif (timeQualifier in ("evening", "afternoon") and
                            int(strHH) < 12):
                        # "tomorrow evening at 7:30" -> 19:30
                        strHH = int(strHH) + 12
            else:
                # try to parse numbers without colons
                # 5 hours, 10 minutes etc.
                strNum = ""
                suffix = ""
                for char in word:
                    if char.isdigit():
                        strNum += char
                    else:
                        suffix += char
                remainder = suffix
                if remainder == "":
                    remainder = wordNext.replace(".", "").strip()

                if (remainder in ("pm", "p.m.") or
                        wordNext in ("pm", "p.m.")):
                    strHH = strNum
                    remainder = "pm"
                    # consume the following word only if it *is* the am/pm
                    # marker ("5 pm"), not when it is attached ("5pm")
                    used = 0 if suffix in ("pm", "p.m.") else 1
                elif (remainder in ("am", "a.m.") or
                        wordNext in ("am", "a.m.")):
                    strHH = strNum
                    remainder = "am"
                    used = 0 if suffix in ("am", "a.m.") else 1
                elif not word.isdigit():
                    # "2017-06-30", "5x": not a clock time
                    isTime = False
                else:
                    value = int(word)
                    if value > 100 and wordPrev in ("o", "oh"):
                        # 0800 hours (pronounced oh-eight-hundred)
                        strHH, strMM = divmod(value, 100)
                        if wordNext == "hours":
                            used += 1
                    elif (wordNext == "hours" and word[0] != '0' and
                            (value < 100 or value > 2400)):
                        # ignores military time
                        # "in 3 hours"
                        hrOffset = value
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "minutes":
                        # "in 10 minutes"
                        minOffset = value
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif wordNext == "seconds":
                        # in 5 seconds
                        secOffset = value
                        used = 2
                        isTime = False
                        hrAbs = -1
                        minAbs = -1
                    elif value > 100:
                        # military time, e.g. 1530 -> 15:30
                        strHH, strMM = divmod(value, 100)
                        if wordNext == "hours":
                            used += 1
                    elif wordNext.isdigit():
                        # "5 30" -> 5:30
                        strHH = word
                        strMM = wordNext
                        used += 1
                        if wordNextNext == "hours":
                            used += 1
                    elif (wordNext == "" or wordNext == "o'clock" or
                            (wordNext == "in" and
                             (wordNextNext == "the" or
                              wordNextNext == timeQualifier))):
                        strHH = word
                        strMM = 0
                        if wordNext == "o'clock":
                            used += 1
                            # "5 o'clock in [the] evening"
                            if (wordNextNext == "in" and
                                    wordNextNextNext in dayPeriods):
                                remainder = dayPeriods[wordNextNextNext]
                                used += 2
                        elif wordNext == "in":
                            # consume "in"; the day-period word itself is
                            # consumed by its own branch of this loop
                            used += 1
                            for candidate in (wordNextNextNext,
                                              wordNextNext):
                                if candidate in dayPeriods:
                                    remainder = dayPeriods[candidate]
                    else:
                        isTime = False

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            if remainder == "pm" and strHH < 12:
                strHH += 12
            if remainder == "am" and strHH >= 12:
                strHH -= 12
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH
                minAbs = strMM
                used += 1

        if used > 0:
            # remove parsed words from the sentence
            for i in range(used):
                if idx + i < len(words):
                    words[idx + i] = ""

            if wordPrev == "o" or wordPrev == "oh":
                words[idx - 1] = ""

            if wordPrev == "early":
                hrOffset = -1
                words[idx - 1] = ""
            elif wordPrev == "late":
                hrOffset = 1
                words[idx - 1] = ""

            # drop the marker(s) that introduced the time ("at", "in", ...)
            if idx > 0 and wordPrev in markers:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in markers:
                words[idx - 2] = ""

            found = True

    # check that we found a date
    if not date_found():
        return None

    # ------------------------------------------------------------------
    # date manipulation
    # ------------------------------------------------------------------
    if hrAbs == -1 or minAbs == -1:
        # only relative offsets were given: count from the anchor's time
        extractedDate = dateNow.replace(microsecond=0)
    else:
        extractedDate = dateNow.replace(microsecond=0, second=0,
                                        minute=0, hour=0)

    if datestr != "":
        if hasYear:
            temp = datetime.strptime(datestr, "%B %d %Y")
            extractedDate = extractedDate.replace(
                year=temp.year, month=temp.month, day=temp.day)
        else:
            # parse with a leap year so "february 29" is accepted, then
            # move to the anchor's year (or the next one if already past)
            temp = datetime.strptime(datestr + " 2000", "%B %d %Y")
            temp = temp.replace(year=extractedDate.year)
            if extractedDate < temp:
                targetYear = currentYear
            else:
                targetYear = currentYear + 1
            extractedDate = extractedDate.replace(
                year=targetYear, month=temp.month, day=temp.day)

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + timedelta(days=dayOffset)

    if hrAbs != -1 and minAbs != -1:
        extractedDate = extractedDate + timedelta(hours=hrAbs,
                                                  minutes=minAbs)
        if (hrAbs != 0 or minAbs != 0) and datestr == "":
            # a bare clock time that already passed today means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + timedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + timedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + timedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + timedelta(seconds=secOffset)

    # drop an "and" left between two consumed expressions
    # ("in 8 weeks and 2 days"); sentence boundaries count as consumed
    for idx, word in enumerate(words):
        if word != "and":
            continue
        before = words[idx - 1] if idx > 0 else ""
        after = words[idx + 1] if idx + 1 < len(words) else ""
        if before == "" and after == "":
            words[idx] = ""

    resultStr = " ".join(" ".join(words).split())
    return [extractedDate, resultStr]
def look_for_fractions(split_list):
    """
    This function takes a list made by splitting on '/' and determines
    whether it describes a fraction.

    Args:
        split_list (list): list created by splitting on '/'
    Returns:
        (bool): True if the list has exactly two numeric parts and the
            second (the denominator) is not zero, otherwise False.
            Rejecting a zero denominator keeps callers, which divide the
            two parts, from raising ZeroDivisionError on text like "1/0".
    """
    if len(split_list) != 2:
        return False
    if not (is_numeric(split_list[0]) and is_numeric(split_list[1])):
        return False
    return float(split_list[1]) != 0


def isFractional_en(input_str):
    """
    This function takes the given text and checks if it is a fraction.

    The comparison ignores case and one trailing "s" ("Fifths" -> "fifth").

    Args:
        input_str (str): the string to check if fractional
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
    """
    word = input_str.lower()
    if word.endswith('s'):
        word = word[:-1]  # e.g. "fifths"

    # position in the list + 1 is the denominator
    aFrac = ["whole", "half", "third", "fourth", "fifth", "sixth",
             "seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth"]
    if word in aFrac:
        return 1.0 / (aFrac.index(word) + 1)
    if word == "quarter":
        return 1.0 / 4

    return False


def get_gender(word, input_string="", lang="en-us"):
    """
    Guess gender of word, optionally use raw input text for context.

    Only language codes starting with "pt" or "es" are handled; both are
    delegated to get_gender_pt (the original comment notes that Spanish
    follows the same rules).

    Returns:
        The result of get_gender_pt for "pt"/"es" languages, False for
        every other language.
    """
    lang_lower = str(lang).lower()
    if lang_lower.startswith("pt") or lang_lower.startswith("es"):
        # spanish follows same rules
        return get_gender_pt(word, input_string)

    return False

# ==============================================================


def normalize(text, lang="en-us", remove_articles=True):
    """Prepare a string for parsing

    This function prepares the given text for parsing by making
    numbers consistent, getting rid of contractions, etc.

    Args:
        text (str): the string to normalize
        lang (str): the code for the language text is in
        remove_articles (bool): whether to remove articles (like 'a', or
                                'the')
    Returns:
        (str): The normalized string; the text is returned unchanged for
               languages other than "en", "es" and "pt".
    """
    lang_lower = str(lang).lower()
    if lang_lower.startswith("en"):
        return normalize_en(text, remove_articles)
    if lang_lower.startswith("es"):
        return normalize_es(text, remove_articles)
    if lang_lower.startswith("pt"):
        return normalize_pt(text, remove_articles)

    # TODO: Normalization for other languages
    return text


_EN_ARTICLES = ("the", "a", "an")

# Common contractions and their expansions, e.g. "isn't" -> "is not".
# Keys are matched case-sensitively against whole words.
_EN_CONTRACTIONS = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "can not",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "gonna": "going to",
    "gotta": "got to",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'll": "I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "mightn't": "might not",
    "might've": "might have",
    "mustn't": "must not",
    "must've": "must have",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "should've": "should have",
    "somebody's": "somebody is",
    "someone'd": "someone would",
    "someone'll": "someone will",
    "someone's": "someone is",
    "that'll": "that will",
    "that's": "that is",
    "that'd": "that would",
    "there'd": "there would",
    "there're": "there are",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'd": "what did",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "whats": "what is",  # technically incorrect but some STT outputs
    "what've": "what have",
    "when's": "when is",
    "when'd": "when did",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'd": "who would",
    "who'd've": "who would have",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "why'd": "why did",
    "why're": "why are",
    "why's": "why is",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "ya'll": "you all",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "y'aint": "you are not",
    "y'ain't": "you are not",
    "you're": "you are",
    "you've": "you have",
}

# "zero" .. "twenty" -> "0" .. "20"
_EN_NUMBER_WORDS = dict(
    (name, str(value)) for value, name in enumerate((
        "zero", "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
        "twenty")))


def normalize_en(text, remove_articles):
    """ English string normalization

    Splits on whitespace (which also collapses repeated spaces), optionally
    drops the articles "the", "a" and "an", expands common contractions
    and converts the number words "zero" to "twenty" into digits.

    Args:
        text (str): the string to normalize
        remove_articles (bool): whether to drop "the", "a" and "an"
    Returns:
        (str): the normalized words joined by single spaces
    """
    normalized = []
    for word in text.split():
        if remove_articles and word in _EN_ARTICLES:
            continue

        # Expand common contractions, e.g. "isn't" -> "is not"
        word = _EN_CONTRACTIONS.get(word, word)

        # Convert numbers into digits, e.g. "two" -> "2"
        word = _EN_NUMBER_WORDS.get(word, word)

        normalized.append(word)

    return " ".join(normalized)

####################################################################
# PT-PT
#
# TODO: numbers greater than 999999
# TODO: date time pt
####################################################################


# Indefinite articles ["um", "uma", "uns", "umas"] can not be suppressed:
# in PT, "um cavalo" means "a horse" or "one horse".
pt_articles = ["o", "a", "os", "as"]

# Accented words are written with \u escapes so the table does not depend
# on the encoding the file is saved in.
pt_numbers = {
    "zero": 0,
    "um": 1,
    "uma": 1,
    "uns": 1,
    "umas": 1,
    "primeiro": 1,
    "segundo": 2,
    "terceiro": 3,
    "dois": 2,
    "duas": 2,
    "tres": 3,
    u"tr\u00eas": 3,
    "quatro": 4,
    "cinco": 5,
    "seis": 6,
    "sete": 7,
    "oito": 8,
    "nove": 9,
    "dez": 10,
    "onze": 11,
    "doze": 12,
    "treze": 13,
    "catorze": 14,
    "quinze": 15,
    "dezasseis": 16,
    "dezassete": 17,
    "dezoito": 18,
    "dezanove": 19,
    "vinte": 20,
    "trinta": 30,
    "quarenta": 40,
    "cinquenta": 50,
    "sessenta": 60,
    "setenta": 70,
    "oitenta": 80,
    "noventa": 90,
    "cem": 100,
    "cento": 100,
    "duzentos": 200,
    "duzentas": 200,
    "trezentos": 300,
    "trezentas": 300,
    "quatrocentos": 400,
    "quatrocentas": 400,
    "quinhentos": 500,
    "quinhentas": 500,
    "seiscentos": 600,
    "seiscentas": 600,
    "setecentos": 700,
    "setecentas": 700,
    "oitocentos": 800,
    "oitocentas": 800,
    "novecentos": 900,
    "novecentas": 900,
    "mil": 1000,
    u"milh\u00e3o": 1000000}

# fraction word (singular, lower case) -> denominator
_PT_FRACTIONS = {
    "meio": 2,
    u"ter\u00e7o": 3,
    "quarto": 4,
    "quinto": 5,
    "sexto": 6,
    "setimo": 7,
    u"s\u00e9timo": 7,
    "septimo": 7,
    u"s\u00e9ptimo": 7,
    "oitavo": 8,
    "nono": 9,
    u"d\u00e9cimo": 10,
    u"vig\u00e9simo": 20,
    u"trig\u00e9simo": 30,
    u"cent\u00e9simo": 100,
    u"mil\u00e9simo": 1000,
}


def isFractional_pt(input_str):
    """
    This function takes the given text and checks if it is a fraction.

    The comparison ignores case and one trailing "s" ("quartos" ->
    "quarto").

    Args:
        input_str (str): the string to check if fractional
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
    """
    word = input_str.lower()
    if word.endswith('s'):
        word = word[:-1]  # e.g. "quintos"

    denominator = _PT_FRACTIONS.get(word)
    if denominator is None:
        return False
    return 1.0 / denominator
def extractnumber_pt(text):
    """
    Extract a number expressed in a Portuguese phrase.

    Number words (looked up in pt_numbers), digits and "n/m" fractions are
    summed as they are met; fraction words (isFractional_pt) multiply the
    running result and "<n> avos" divides it. An "e" after a number and a
    decimal separator word ("ponto", "virgula", ".", ",") are resolved by
    parsing the rest of the phrase recursively.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float): The value of the extracted number, or False when
            no number was found. A float with no fractional part is
            returned as int.
    """
    ands = ["e"]
    decimals = ["ponto", "virgula", u"v\u00edrgula", ".", ","]

    def leading_zeros(tokens):
        # number of "zero"/"0" words at the start of tokens
        zeros = 0
        for token in tokens:
            if token == "zero" or token == "0":
                zeros += 1
            else:
                break
        return zeros

    aWords = text.split()
    count = 0
    result = None
    while count < len(aWords):
        val = 0
        # `found` is tracked separately from `val` so that the number zero
        # is not mistaken for "this word is not a number"
        found = False
        word = aWords[count]
        next_word = aWords[count + 1] if count + 1 < len(aWords) else None
        next_next_word = \
            aWords[count + 2] if count + 2 < len(aWords) else None

        # is current word a number?
        if word in pt_numbers:
            val = pt_numbers[word]
            found = True
        elif word.isdigit():  # doesn't work with decimals
            val = int(word)
            found = True
        elif is_numeric(word):
            val = float(word)
            found = True
        else:
            fraction = isFractional_pt(word)
            if fraction:
                if not result:
                    result = 1
                result = result * fraction
                count += 1
                continue
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])
                found = True

        if found:
            if result is None:
                result = 0
            # handle fractions
            if next_word != "avos":
                result += val
            elif val:
                # "<result> <val> avos"; a zero denominator is ignored
                result = float(result) / float(val)

        if next_word is None:
            break

        # number word and fraction
        if next_word in ands:
            if result is None:
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterAndVal = extractnumber_pt(" ".join(newWords))
            if afterAndVal:
                if result < afterAndVal or result < 20:
                    # treat what follows "e" as decimal places
                    while afterAndVal > 1:
                        afterAndVal = afterAndVal / 10.0
                    for _ in range(leading_zeros(newWords)):
                        afterAndVal = afterAndVal / 10.0
                result += afterAndVal
                break
        elif next_next_word is not None:
            if next_next_word in ands:
                newWords = aWords[count + 3:]
                afterAndVal = extractnumber_pt(" ".join(newWords))
                if afterAndVal:
                    if result is None:
                        result = 0
                    result += afterAndVal
                    break

        # A decimal separator only makes sense after a number; without
        # one the scan simply continues.
        if next_word in decimals and result is not None:
            newWords = aWords[count + 2:]
            afterDotVal = extractnumber_pt(" ".join(newWords))
            # False means nothing numeric follows the separator
            # ("dois ponto"): keep the integer part as it is.
            if afterDotVal is not False:
                digits = "0" * leading_zeros(newWords) + \
                    "%s" % (afterDotVal,)
                try:
                    result = float("%s.%s" % (result, digits))
                except ValueError:
                    # either side already has a decimal point; the pieces
                    # cannot be glued together, keep the integer part
                    pass
            break
        count += 1

    if result is None:
        return False

    # cast floats without a fractional part ("2.0") to int
    text_result = "%s" % (result,)
    if "." in text_result:
        integer, dec = text_result.split(".")
        if dec == "0":
            result = int(integer)

    return result
def pt_number_parse(words, i):
    """
    Parse a Portuguese number (0 - 999999) starting at words[i].

    Recognises "zero", numbers below 100 ("dezanove", "vinte e um"),
    hundreds with an optional "e" ("cento e vinte e um") and thousands
    ("dois mil", "dois mil e dezassete"). Word values come from
    pt_numbers.

    Args:
        words (list): the words of the sentence
        i (int): index of the word to start parsing at
    Returns:
        (tuple) or None: (value, index of the first word after the number),
            or None when no number starts at words[i].
    """

    def pt_cte(i, s):
        # match the literal word s at position i
        if i < len(words) and s == words[i]:
            return s, i + 1
        return None

    def pt_skip_e(i):
        # index after an optional connecting "e"
        r = pt_cte(i, "e")
        return r[1] if r else i

    def pt_number_word(i, mi, ma):
        # match a single number word whose value lies in [mi, ma]
        if i < len(words):
            v = pt_numbers.get(words[i])
            # `is not None` rather than truthiness, so that 0 can match
            if v is not None and mi <= v <= ma:
                return v, i + 1
        return None

    def pt_number_1_99(i):
        # 1-19 are single words
        r1 = pt_number_word(i, 1, 19)
        if r1:
            return r1

        # tens, optionally followed by "e" + unit ("vinte e um")
        r1 = pt_number_word(i, 20, 90)
        if r1:
            v1, i1 = r1
            r2 = pt_cte(i1, "e")
            if r2:
                i2 = r2[1]
                r3 = pt_number_word(i2, 1, 9)
                if r3:
                    v3, i3 = r3
                    return v1 + v3, i3
            return r1
        return None

    def pt_number_1_999(i):
        # hundreds, optionally followed by ["e"] [1-99]
        r1 = pt_number_word(i, 100, 900)
        if r1:
            v1, i1 = r1
            r2 = pt_number_1_99(pt_skip_e(i1))
            if r2:
                v2, i2 = r2
                return v1 + v2, i2
            return r1

        # [1-99]
        return pt_number_1_99(i)

    def pt_number(i):
        # check for zero
        r1 = pt_number_word(i, 0, 0)
        if r1:
            return r1

        # check for [1-999] (mil ["e"] [1-999])?
        r1 = pt_number_1_999(i)
        if r1:
            v1, i1 = r1
            r2 = pt_cte(i1, "mil")
            if r2:
                i2 = r2[1]
                r3 = pt_number_1_999(pt_skip_e(i2))
                if r3:
                    v3, i3 = r3
                    return v1 * 1000 + v3, i3
                return v1 * 1000, i2
            return r1
        return None

    return pt_number(i)


def normalize_pt(text, remove_articles):
    """ PT string normalization

    Splits on whitespace (which also collapses repeated spaces),
    optionally drops the articles listed in pt_articles, converts number
    words into digits and passes the result through pt_pruning.

    Args:
        text (str): the string to normalize
        remove_articles (bool): whether to drop articles; also forwarded
            to pt_pruning as its `agressive` argument
    Returns:
        The value returned by pt_pruning for the normalized text.
    """
    words = text.split()
    normalized = []
    # Contractions are not common in PT

    i = 0
    while i < len(words):
        word = words[i]
        # remove articles
        if remove_articles and word in pt_articles:
            i += 1
            continue

        # Convert numbers into digits, e.g. "dois" -> "2"
        r = pt_number_parse(words, i)
        if r:
            v, i = r
            normalized.append(str(v))
            continue

        # NOTE temporary, handle some numbers above 999
        if word in pt_numbers:
            word = str(pt_numbers[word])
        # end temporary

        normalized.append(word)
        i += 1

    # some articles in pt-pt can not be removed, but many words can
    # this is experimental and some meaning may be lost
    # maybe agressive should default to False
    # only usage will tell, as a native speaker this seems reasonable
    return pt_pruning(" ".join(normalized), agressive=remove_articles)
# Accented letters and the plain letter each one is folded to.  Written with
# explicit escapes so the table does not depend on the encoding declared for
# this source file.
_PT_ACCENT_FOLDS = (
    ("a", u"\u00e1\u00e0\u00e3\u00e2\u00e4"),
    ("e", u"\u00e9\u00e8\u00ea\u00eb"),
    ("i", u"\u00ed\u00ec\u00ee\u00ef"),
    ("o", u"\u00f3\u00f2\u00f5\u00f4\u00f6"),
    ("u", u"\u00fa\u00f9\u00fb\u00fc"),
    ("c", u"\u00e7"),
)

# Masculine / feminine ordinal indicators, dropped together with punctuation.
_PT_ORDINAL_MARKS = [u"\u00ba", u"\u00aa"]


def _pt_fold_accents(text):
    """Return text with accented letters replaced by their base letter."""
    for base, accented in _PT_ACCENT_FOLDS:
        for char in accented:
            text = text.replace(char, base)
            text = text.replace(char.upper(), base.upper())
    return text


def _pt_split_clock(token):
    """Split a clock token such as "3:30pm" into (hours, minutes, suffix).

    hours and minutes are digit strings (minutes may be empty); suffix is
    whatever follows them, with dots removed.
    """
    pos = 0
    size = len(token)
    while pos < size and token[pos].isdigit():
        pos += 1
    hours = token[:pos]
    minutes = ""
    if pos < size and token[pos] == ":":
        pos += 1
        first = pos
        while pos < size and token[pos].isdigit():
            pos += 1
        minutes = token[first:pos]
    return hours, minutes, token[pos:].replace(".", "")


def extract_datetime_pt(input_str, currentDate=None):
    """Extract a date and time from a Portuguese sentence.

    The sentence is scanned twice: first for date expressions (today,
    tomorrow, weekdays, "N semanas", month names, ...), then for time
    expressions (clock times, parts of the day, "N horas/minutos").
    Every word that was understood is removed from the sentence.

    Args:
        input_str (str): the sentence to parse
        currentDate (datetime, optional): anchor for relative expressions;
            defaults to datetime.now()

    Returns:
        list or None: [datetime, str] holding the extracted moment and the
            remaining (pruned) sentence, or None when input_str is empty
            or contains no date/time expression.

    Raises:
        ValueError: if a spoken day/month/year is not a valid calendar
            date or a captured number is not a plain integer.
    """

    def clean_string(text):
        # strip punctuation, articles, accents and the plurals that matter
        # for parsing, and map synonyms to one canonical word
        symbols = [".", ",", ";", "?", "!"] + _PT_ORDINAL_MARKS
        noise_words = ["o", "os", "a", "as", "do", "da", "dos", "das", "de",
                       "ao", "aos"]

        for symbol in symbols:
            text = text.replace(symbol, "")
        for noise in noise_words:
            text = text.replace(" " + noise + " ", " ")
        text = _pt_fold_accents(text.lower())
        text = text.replace("-", " ").replace("_", "")

        # handle synonyms and equivalents, "tomorrow early" = "tomorrow
        # morning"
        synonyms = (("manha", ["manhazinha", "cedo", "cedinho"]),
                    ("tarde", ["tardinha", "tarde"]),
                    ("noite", ["noitinha", "anoitecer"]),
                    ("todos", ["ao", "aos"]),
                    ("em", ["do", "da", "dos", "das", "de"]))
        for canonical, variants in synonyms:
            for variant in variants:
                text = text.replace(" " + variant + " ",
                                    " " + canonical + " ")

        # relevant plurals, can't just strip every trailing "s" in pt
        plurals = ["manhas", "noites", "tardes", "dias", "semanas", "anos",
                   "minutos", "segundos", "nas", "nos", "proximas",
                   "seguintes", "horas"]
        for plural in plurals:
            text = text.replace(plural, plural.rstrip('s'))
        text = text.replace("meses", "mes").replace("anteriores", "anterior")
        return text

    def date_found():
        return found or (
            datestr != "" or yearOffset != 0 or monthOffset != 0 or
            dayOffset is not False or hrOffset != 0 or hrAbs != 0 or
            minOffset != 0 or minAbs != 0 or secOffset != 0)

    def looks_like_year(token):
        # only a bare 4-digit number is taken as a year, so that a clock
        # time or a count following the date is left for the time pass
        return len(token) == 4 and token.isdigit()

    def night_meridiem(hour_text):
        # "1 da noite" .. "5 da noite" are the small hours, the rest is pm
        return "am" if 0 < int(hour_text) < 6 else "pm"

    def relative_amount(prev_word, next_word):
        # Interpret "<N|proximo|ultimo> UNIT" and "UNIT <seguinte|passado>".
        # Returns (amount, counted, before, after): amount is None when
        # nothing matched, counted tells whether it is an explicit number,
        # before/after are how many neighbouring words were consumed.
        amount = None
        counted = False
        before = after = 0
        if prev_word and prev_word[0].isdigit():
            amount, counted, before = int(prev_word), True, 1
        elif prev_word in nexts:
            amount, before = 1, 1
        elif prev_word in lasts:
            amount, before = -1, 1
        if next_word in suffix_nexts:
            amount, counted, after = 1, False, 1
        elif next_word in suffix_lasts:
            amount, counted, after = -1, False, 1
        return amount, counted, before, after

    if input_str == "":
        return None
    if currentDate is None:
        currentDate = datetime.now()

    found = False
    daySpecified = False
    dayOffset = False  # False means "no day expression seen yet"
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = int(dateNow.strftime("%w"))
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    words = clean_string(input_str).split(" ")
    timeQualifiersList = ['manha', 'tarde', 'noite']
    time_indicators = ["em", "as", "nas", "pelas", "volta", "depois", "estas",
                       "no", "dia", "hora"]
    days = ['segunda', 'terca', 'quarta',
            'quinta', 'sexta', 'sabado', 'domingo']
    months = ['janeiro', 'fevereiro', 'marco', 'abril', 'maio', 'junho',
              'julho', 'agosto', 'setembro', 'outubro', 'novembro',
              'dezembro']
    # "dez" is deliberately not accepted: it is also the word for "ten"
    monthsShort = ['jan', 'fev', 'mar', 'abr', 'mai', 'jun', 'jul', 'ago',
                   'set', 'out', 'nov']
    month_numbers = {}
    for number, name in enumerate(months, 1):
        month_numbers[name] = number
    for number, name in enumerate(monthsShort, 1):
        month_numbers[name] = number
    # spellings accepted by earlier versions, kept for compatibility
    month_numbers.update({'febreiro': 2, 'feb': 2, 'ag': 8, 'dec': 12})
    full_month_names = months + ['febreiro']

    nexts = ["proximo", "proxima"]
    suffix_nexts = ["seguinte", "subsequente", "seguir"]
    lasts = ["ultimo", "ultima"]
    suffix_lasts = ["passada", "passado", "anterior", "antes"]
    nxts = ["depois", "seguir", "seguida", "seguinte", "proxima", "proximo"]
    prevs = ["antes", "ante", "previa", "previamente", "anterior"]
    froms = ["partir", "em", "para", "na", "no", "daqui", "seguir",
             "depois", "por", "proxima", "proximo", "da", "do", "de"]
    thises = ["este", "esta", "deste", "desta", "neste", "nesta", "nesse",
              "nessa"]
    froms += thises
    lists = set(nxts + prevs + froms + time_indicators)

    # words that may follow a "from" word: "a partir de amanha", "na sexta"
    validFollowups = set(days)
    validFollowups.update(month_numbers)
    validFollowups.update(["hoje", "amanha", "ontem", "anteontem", "agora",
                           "ja", "ante"])

    # ---- first pass: dates -------------------------------------------
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""

        start = idx  # index of the first word to blank out
        used = 0  # number of words to blank out, counted from start
        # save timequalifier for later
        if word in timeQualifiersList:
            timeQualifier = word
        # parse today, tomorrow, yesterday
        elif word == "hoje" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "amanha" and not fromFlag:
            dayOffset = 1
            used += 1
        elif word == "ontem" and not fromFlag:
            dayOffset -= 1
            used += 1
        # "before yesterday" and "before before yesterday"
        elif (word == "anteontem" or
              (word == "ante" and wordNext == "ontem")) and not fromFlag:
            dayOffset -= 2
            used += 1
            if wordNext == "ontem":
                used += 1
        elif (word == "ante" and wordNext == "ante" and
              wordNextNext == "ontem" and not fromFlag):
            dayOffset -= 3
            used += 3
        elif word == "anteanteontem" and not fromFlag:
            dayOffset -= 3
            used += 1
        # day after tomorrow
        elif word == "depois" and wordNext == "amanha" and not fromFlag:
            dayOffset += 2
            used = 2
        # day before yesterday
        elif word == "antes" and wordNext == "ontem" and not fromFlag:
            dayOffset -= 2
            used = 2
        # parse 5 days, 10 weeks, last week, next week, week after
        elif word == "dia":
            if wordNext == "depois" or wordNext == "antes":
                used += 1
                if wordPrev and wordPrev[0].isdigit():
                    dayOffset += int(wordPrev)
                    start -= 1
                    used += 1
            elif (wordPrev and wordPrev[0].isdigit() and
                  wordNext not in month_numbers):
                dayOffset += int(wordPrev)
                start -= 1
                used += 2
            elif (wordNext and wordNext[0].isdigit() and
                  wordNextNext not in month_numbers):
                # the number follows "dia", so nothing before it is consumed
                dayOffset += int(wordNext)
                used += 2
        elif word == "semana" and not fromFlag:
            amount, counted, before, after = relative_amount(wordPrev,
                                                             wordNext)
            if amount is not None:
                if counted:
                    dayOffset += amount * 7
                else:
                    dayOffset = amount * 7
                start -= before
                used = 1 + before + after
        # parse 10 months, next month, last month
        elif word == "mes" and not fromFlag:
            amount, counted, before, after = relative_amount(wordPrev,
                                                             wordNext)
            if amount is not None:
                monthOffset = amount
                start -= before
                used = 1 + before + after
        # parse 5 years, next year, last year
        elif word == "ano" and not fromFlag:
            amount, counted, before, after = relative_amount(wordPrev,
                                                             wordNext)
            if amount is not None:
                yearOffset = amount
                start -= before
                used = 1 + before + after
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:
            # "%w" counts from Sunday = 0, the days list from Monday = 0
            dayOffset = (days.index(word) + 1) - today
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordPrev in nexts:
                dayOffset += 7
                used += 1
                start -= 1
            if wordPrev in lasts:
                dayOffset -= 7
                used += 1
                start -= 1
            if wordNext in suffix_nexts:
                dayOffset += 7
                used += 1
            if wordNext in suffix_lasts:
                dayOffset -= 7
                used += 1
            if wordNext == "feira":
                used += 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        elif word in month_numbers:
            used += 1
            # datestr is "<canonical month> <day> [<year>]"
            datestr = months[month_numbers[word] - 1]
            day_word = ""
            year_word = ""
            if wordPrev and wordPrev[0].isdigit():
                # 13 maio
                day_word, year_word = wordPrev, wordNext
                start -= 1
                used += 1
            elif wordNext and wordNext[0].isdigit():
                # maio 13
                day_word, year_word = wordNext, wordNextNext
                used += 1
            elif wordPrevPrev and wordPrevPrev[0].isdigit():
                # 13 dia maio
                day_word, year_word = wordPrevPrev, wordNext
                start -= 2
                used += 2
            elif wordNextNext and wordNextNext[0].isdigit():
                # maio dia 13
                day_word, year_word = wordNextNext, wordNextNextNext
                used += 2
            if day_word:
                datestr += " " + day_word
                hasYear = looks_like_year(year_word)
                if hasYear:
                    datestr += " " + year_word
                    used += 1
            else:
                # a month name without a day is not a date
                datestr = ""

        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        # TODO debug word "depois" that one is failing for some reason
        if word in froms and wordNext in validFollowups:
            # NOTE: the original guard also tested
            # `not (wordNext == "amanha" and wordNext == "ontem")`, which
            # can never be false; only the effective part is kept.
            if word not in ("depois", "antes", "em"):
                used = 2
            fromFlag = True
            if wordNext == "amanha" and word != "depois":
                dayOffset += 1
            elif wordNext == "ontem":
                dayOffset -= 1
            elif wordNext == "anteontem":
                dayOffset -= 2
            elif wordNext == "ante" and wordNextNext == "ontem":
                dayOffset -= 2
            elif (wordNext == "ante" and wordNextNext == "ante" and
                  wordNextNextNext == "ontem"):
                dayOffset -= 3
            elif wordNext in days:
                tmpOffset = (days.index(wordNext) + 1) - today
                used = 2
                if wordNextNext == "feira":
                    used += 1
                if tmpOffset < 0:
                    tmpOffset += 7
                if wordNextNext:
                    if wordNextNext in nxts:
                        tmpOffset += 7
                        used += 1
                    elif wordNextNext in prevs:
                        tmpOffset -= 7
                        used += 1
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                tmpOffset = (days.index(wordNextNext) + 1) - today
                used = 3
                if wordNextNextNext:
                    if wordNextNextNext in nxts:
                        tmpOffset += 7
                        used += 1
                    elif wordNextNextNext in prevs:
                        tmpOffset -= 7
                        used += 1
                dayOffset += tmpOffset
                if wordNextNextNext == "feira":
                    used += 1
        if wordNext in full_month_names:
            used -= 1
        if used > 0:
            # also swallow a preceding indicator word ("na", "em", ...)
            if start - 1 > 0 and words[start - 1] in lists:
                start -= 1
                used += 1

            for i in range(0, used):
                words[i + start] = ""

            if start - 1 >= 0 and words[start - 1] in lists:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ---- second pass: times ------------------------------------------
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = 0  # -1 marks "a relative time was given, no absolute time"
    minAbs = 0

    # arbitrary hours given to vague parts of the day
    part_of_day_hours = {"manha": 8, "tarde": 15, "noite": 22}
    phrase_hours = {("meio", "tarde"): 17, ("meio", "manha"): 10,
                    ("fim", "tarde"): 19, ("fim", "manha"): 11,
                    ("tantas", "manha"): 4}

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""
        wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else ""
        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word == "meio" and wordNext == "dia":
            hrAbs = 12
            used += 2
        elif word == "meia" and wordNext == "noite":
            hrAbs = 0
            used += 2
        elif (word, wordNext) in phrase_hours:
            if hrAbs == 0:
                hrAbs = phrase_hours[(word, wordNext)]
            used += 2
        elif word in part_of_day_hours:
            if hrAbs == 0:
                hrAbs = part_of_day_hours[word]
            used += 1
        # parse half an hour, quarter hour
        elif word == "hora" and \
                (wordPrev in time_indicators or wordPrevPrev in
                 time_indicators):
            if wordPrev == "meia":
                minOffset = 30
            elif wordPrev == "quarto":
                minOffset = 15
            elif wordPrevPrev == "quarto":
                minOffset = 15
                if idx > 2 and words[idx - 3] in time_indicators:
                    words[idx - 3] = ""
                words[idx - 2] = ""
            else:
                hrOffset = 1
            if wordPrevPrev in time_indicators:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # parse colons
                # "3:00 in the morning"
                strHH, strMM, remainder = _pt_split_clock(word)
                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord == "am" or nextWord == "pm":
                        remainder = nextWord
                        used += 1
                    elif wordNext == "manha":
                        remainder = "am"
                        used += 1
                    elif wordNext == "tarde":
                        remainder = "pm"
                        used += 1
                    elif wordNext == "noite":
                        remainder = night_meridiem(strHH)
                        used += 1
                    elif wordNext in thises and wordNextNext == "manha":
                        remainder = "am"
                        used = 2
                    elif wordNext in thises and wordNextNext == "tarde":
                        remainder = "pm"
                        used = 2
                    elif wordNext in thises and wordNextNext == "noite":
                        remainder = "pm"
                        used = 2
                    # no marker next to the time: fall back on a part of
                    # the day mentioned elsewhere in the sentence
                    elif timeQualifier == "tarde":
                        remainder = "pm"
                    elif timeQualifier == "noite":
                        remainder = night_meridiem(strHH)
            else:
                # try to parse numbers without colons
                # 5 hours, 10 minutes etc.
                strNum = "".join(c for c in word if c.isdigit())
                remainder = "".join(c for c in word if not c.isdigit())
                nextMarker = wordNext.replace(".", "").strip()
                if remainder == "":
                    remainder = nextMarker

                if remainder == "pm" or nextMarker == "pm":
                    strHH = strNum
                    remainder = "pm"
                    # only consume the next word if it was the marker
                    used = 1 if nextMarker == "pm" else 0
                elif remainder == "am" or nextMarker == "am":
                    strHH = strNum
                    remainder = "am"
                    used = 1 if nextMarker == "am" else 0
                elif wordNext == "tarde":
                    strHH = strNum
                    remainder = "pm"
                    used = 1
                elif wordNext == "manha":
                    strHH = strNum
                    remainder = "am"
                    used = 1
                elif not word.isdigit():
                    # "5kg", "2o", ... : a quantity, not a time
                    isTime = False
                elif int(word) > 100 and wordPrev in ("o", "oh", "zero"):
                    # 0800 hours (pronounced oh-eight-hundred)
                    strHH, strMM = divmod(int(word), 100)
                    if wordNext == "hora":
                        used += 1
                elif (wordNext == "hora" and word[0] != '0' and
                      (int(word) < 100 or int(word) > 2400)):
                    # ignores military time
                    # "in 3 hours"
                    hrOffset = int(word)
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif wordNext == "minuto":
                    # "in 10 minutes"
                    minOffset = int(word)
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif wordNext == "segundo":
                    # in 5 seconds
                    secOffset = int(word)
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif int(word) > 100:
                    # military time, e.g. 1530
                    strHH, strMM = divmod(int(word), 100)
                    if wordNext == "hora":
                        used += 1
                elif wordNext == "" or (
                        wordNext == "em" and wordNextNext == "ponto"):
                    strHH = word
                    strMM = 0
                    if wordNext == "em" and wordNextNext == "ponto":
                        used += 2
                        if wordNextNextNext == "tarde":
                            remainder = "pm"
                            used += 1
                        elif wordNextNextNext == "manha":
                            remainder = "am"
                            used += 1
                        elif wordNextNextNext == "noite":
                            remainder = night_meridiem(strHH)
                            used += 1
                elif wordNext[0].isdigit():
                    # "15 30": hours followed by minutes
                    strHH = word
                    strMM = wordNext
                    used += 1
                    if wordNextNext == "hora":
                        used += 1
                else:
                    isTime = False

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            if remainder == "pm" and 0 < strHH < 12:
                strHH += 12
            if remainder == "am" and strHH >= 12:
                strHH -= 12
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH
                minAbs = strMM
                used += 1

        if used > 0:
            # removed parsed words from the sentence
            for i in range(used):
                words[idx + i] = ""

            # wordPrev is only non-empty when idx > 0
            if wordPrev == "ponto" or wordPrev in time_indicators:
                words[idx - 1] = ""
            if idx > 1 and wordPrevPrev in time_indicators:
                words[idx - 2] = ""

            found = True

    # check that we found a date
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation: everything is relative to midnight of
    # the anchor day
    extractedDate = dateNow.replace(microsecond=0, second=0, minute=0,
                                    hour=0)
    if datestr != "":
        parts = datestr.split(" ")
        month = month_numbers[parts[0]]
        day = int(parts[1])
        if hasYear:
            extractedDate = extractedDate.replace(year=int(parts[2]),
                                                  month=month, day=day)
        else:
            candidate = extractedDate.replace(month=month, day=day)
            if extractedDate < candidate:
                extractedDate = candidate
            else:
                # the date is today or already past: take next year's
                extractedDate = candidate.replace(year=candidate.year + 1)

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + timedelta(days=dayOffset)

    if hrAbs != -1 and minAbs != -1:
        extractedDate = extractedDate + timedelta(hours=hrAbs,
                                                  minutes=minAbs)
        if (hrAbs != 0 or minAbs != 0) and datestr == "":
            # a bare time that has already passed today means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + timedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + timedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + timedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + timedelta(seconds=secOffset)

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())
    resultStr = pt_pruning(resultStr)
    return [extractedDate, resultStr]


def pt_pruning(text, symbols=True, accents=True, agressive=True):
    """Strip punctuation, accents and filler words from Portuguese text.

    Args:
        text (str): the text to prune
        symbols (bool): remove punctuation and ordinal marks, and turn
            "-" and "_" into spaces
        accents (bool): replace accented letters by their base letter
        agressive (bool): drop articles, prepositions and other filler
            words; some meaning may be lost

    Returns:
        (str): the pruned text with runs of whitespace collapsed
    """
    # agressive pt word pruning
    filler_words = ["a", "o", "os", "as", "de", "dos", "das",
                    "lhe", "lhes", "me", "e", "no", "nas", "na", "nos", "em",
                    "para", "este", "esta", "deste", "desta", "neste",
                    "nesta", "nesse", "nessa", "foi", "que"]
    if symbols:
        for symbol in [".", ",", ";", ":", "!", "?"] + _PT_ORDINAL_MARKS:
            text = text.replace(symbol, "")
        text = text.replace("-", " ").replace("_", " ")
    if accents:
        text = _pt_fold_accents(text)
    if agressive:
        kept = [word for word in text.split(" ")
                if word not in filler_words]
        text = " ".join(kept)
        text = ' '.join(text.split())
    return text


def get_gender_pt(word, raw_string=""):
    """Guess the grammatical gender of a Portuguese word.

    When the word appears in raw_string after another word, the gender
    of that preceding word (usually an article) is used; otherwise the
    gender is guessed from the last letter of the singular form.

    Args:
        word (str): the word to classify; trailing "s" are ignored
        raw_string (str): optional sentence containing the word

    Returns:
        "f", "m", or False when the gender can not be determined
    """
    word = word.rstrip("s")
    gender = False
    words = raw_string.split(" ")
    for idx, w in enumerate(words):
        if w == word and idx != 0:
            gender = get_gender_pt(words[idx - 1])
            break
    # an empty word (e.g. "" or "s") has no ending to inspect
    if not gender and word:
        if word[-1] == "a":
            gender = "f"
        if word[-1] == "o" or word[-1] == "e":
            gender = "m"
    return gender


####################################################################
# Spanish normalization
#
#   TODO: numbers greater than 999999
####################################################################

# Undefined articles ["un", "una", "unos", "unas"] can not be suppressed,
# in Spanish, "un caballo" means "a horse" or "one horse".
es_articles = ["el", "la", "los", "las"]

# Spanish number words and their values.  Accented spellings are written
# with explicit escapes so the table does not depend on the encoding
# declared for this source file.
es_numbers_xlat = {
    "cero": 0,
    "un": 1,
    "uno": 1,
    "una": 1,
    "dos": 2,
    "tres": 3,
    # NOTE(review): the accented letter of this key was unreadable in the
    # reviewed copy; "e" with acute accent is assumed - confirm.
    u"tr\u00e9s": 3,
    "cuatro": 4,
    "cinco": 5,
    "seis": 6,
    "siete": 7,
    "ocho": 8,
    "nueve": 9,
    "diez": 10,
    "once": 11,
    "doce": 12,
    "trece": 13,
    "catorce": 14,
    "quince": 15,
    "dieciseis": 16,
    u"diecis\u00e9is": 16,
    "diecisiete": 17,
    "dieciocho": 18,
    "diecinueve": 19,
    "veinte": 20,
    "veintiuno": 21,
    u"veintid\u00f3s": 22,
    u"veintitr\u00e9s": 23,
    "veintidos": 22,
    "veintitres": 23,
    "veinticuatro": 24,
    "veinticinco": 25,
    u"veintis\u00e9is": 26,
    "veintiseis": 26,
    "veintisiete": 27,
    "veintiocho": 28,
    "veintinueve": 29,
    "treinta": 30,
    "cuarenta": 40,
    "cincuenta": 50,
    "sesenta": 60,
    "setenta": 70,
    "ochenta": 80,
    "noventa": 90,
    "cien": 100,
    "ciento": 100,
    "doscientos": 200,
    "doscientas": 200,
    "trescientos": 300,
    "trescientas": 300,
    "cuatrocientos": 400,
    "cuatrocientas": 400,
    "quinientos": 500,
    "quinientas": 500,
    "seiscientos": 600,
    "seiscientas": 600,
    "setecientos": 700,
    "setecientas": 700,
    "ochocientos": 800,
    "ochocientas": 800,
    "novecientos": 900,
    "novecientas": 900}


def es_parse(words, i):
    """Parse a spelled-out Spanish number starting at words[i].

    Recognises "cero" and the numbers 1 to 999999, e.g.
    "dos mil trescientos cuarenta y cinco".

    Args:
        words (list): the words of the sentence
        i (int): index of the word where parsing starts

    Returns:
        tuple or None: (value, next_index) where next_index is the index
            of the first word after the number, or None if words[i]
            does not start a number.
    """
    def es_cte(i, s):
        # match the literal word s at position i
        if i < len(words) and s == words[i]:
            return s, i + 1
        return None

    def es_number_word(i, mi, ma):
        # match a single number word whose value lies in [mi, ma]
        if i < len(words):
            v = es_numbers_xlat.get(words[i])
            # compare against None: 0 ("cero") is a valid value
            if v is not None and mi <= v <= ma:
                return v, i + 1
        return None

    def es_number_1_99(i):
        r1 = es_number_word(i, 1, 29)
        if r1:
            return r1

        # [30-90] (y [1-9])?
        r1 = es_number_word(i, 30, 90)
        if r1:
            v1, i1 = r1
            r2 = es_cte(i1, "y")
            if r2:
                _, i2 = r2
                r3 = es_number_word(i2, 1, 9)
                if r3:
                    v3, i3 = r3
                    return v1 + v3, i3
            return r1
        return None

    def es_number_1_999(i):
        # [2-9]cientos [1-99]?
        r1 = es_number_word(i, 100, 900)
        if r1:
            v1, i1 = r1
            r2 = es_number_1_99(i1)
            if r2:
                v2, i2 = r2
                return v1 + v2, i2
            return r1

        # [1-99]
        return es_number_1_99(i)

    def es_thousands(multiplier, i):
        # words[i - 1] was "mil": add the optional [1-999] that follows
        r = es_number_1_999(i)
        if r:
            v, i_after = r
            return multiplier * 1000 + v, i_after
        return multiplier * 1000, i

    def es_number(i):
        # check for cero
        r1 = es_number_word(i, 0, 0)
        if r1:
            return r1

        # check for [1-999] (mil [0-999])?
        r1 = es_number_1_999(i)
        if r1:
            v1, i1 = r1
            r2 = es_cte(i1, "mil")
            if r2:
                return es_thousands(v1, r2[1])
            return r1

        # a bare "mil" is one thousand: mil [0-999]?
        r2 = es_cte(i, "mil")
        if r2:
            return es_thousands(1, r2[1])
        return None

    return es_number(i)


def normalize_es(text, remove_articles):
    """Normalize a Spanish string.

    Collapses whitespace, turns spelled-out numbers into digits and
    optionally removes the definite articles listed in es_articles.

    Args:
        text (str): the text to normalize
        remove_articles (bool): whether to drop "el", "la", "los", "las"

    Returns:
        (str): the normalized text
    """
    words = text.split()  # this also removes extra spaces
    normalized = []

    i = 0
    while i < len(words):
        word = words[i]

        if remove_articles and word in es_articles:
            i += 1
            continue

        # Convert numbers into digits
        r = es_parse(words, i)
        if r:
            v, i = r
            normalized.append(str(v))
            continue

        normalized.append(word)
        i += 1

    return " ".join(normalized)