934 lines
33 KiB
Python
934 lines
33 KiB
Python
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright 2017 Mycroft AI Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
from datetime import datetime
|
|
from dateutil.relativedelta import relativedelta
|
|
from mycroft.util.lang.parse_common import is_numeric, look_for_fractions, \
|
|
extract_numbers_generic
|
|
from mycroft.util.lang.format_da import pronounce_number_da
|
|
|
|
# Danish number words mapped to their integer value.
# Both genders of "one" ("en"/"et") are listed.  A few keys are historical
# misspellings; they are kept as aliases next to the correct spelling so that
# any existing lookup keeps working.
da_numbers = {
    'nul': 0,
    'en': 1,
    'et': 1,
    'to': 2,
    'tre': 3,
    'fire': 4,
    'fem': 5,
    'seks': 6,
    'syv': 7,
    'otte': 8,
    'ni': 9,
    'ti': 10,
    'elve': 11,
    'tolv': 12,
    'tretten': 13,
    'fjorten': 14,
    'femten': 15,
    'seksten': 16,
    'sytten': 17,
    'atten': 18,
    'nitten': 19,
    'tyve': 20,
    'enogtyve': 21,
    'toogtyve': 22,
    'treogtyve': 23,
    'fireogtyve': 24,
    'femogtyve': 25,
    'seksogtyve': 26,
    'syvogtyve': 27,
    'otteogtyve': 28,
    'niogtyve': 29,
    'tredive': 30,
    'enogtredive': 31,
    'fyrre': 40,
    'fyrrre': 40,       # legacy misspelling of "fyrre"
    'halvtreds': 50,
    'halvtres': 50,     # legacy misspelling of "halvtreds"
    'tres': 60,
    'halvfjerds': 70,
    'halvfjers': 70,    # legacy misspelling of "halvfjerds"
    'firs': 80,
    'halvfems': 90,
    'hundrede': 100,
    'hunderede': 100,   # legacy misspelling of "hundrede"
    'tohundrede': 200,
    'trehundrede': 300,
    'firehundrede': 400,
    'femhundrede': 500,
    'sekshundrede': 600,
    'syvhundrede': 700,
    'ottehundrede': 800,
    'nihundrede': 900,
    'tusinde': 1000,
    'million': 1000000,
}
|
|
|
|
|
|
def extractnumber_da(text):
    """
    Extract the first number mentioned in a Danish text.

    Recognised forms are digits (integers and decimals), the number words
    listed in da_numbers, ordinals, fraction words, fractions written as
    "2/3", a number word followed by a fraction word (multiplied together)
    and two numbers joined by "og" (added together, e.g. "to og tyve").

    The articles "den" and "det" are ignored.  The indefinite articles
    "en"/"et" cannot be ignored because they also mean "one".

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float): The value of the extracted number, or False when
                          no number was found.  NOTE: a value of zero is
                          falsy and is therefore also reported as False.
    """
    aWords = text.split()
    aWords = [word for word in aWords if word not in ["den", "det"]]
    and_pass = False      # True once "<number> og" has been seen
    valPreAnd = False     # the number that preceded "og"
    val = False
    count = 0
    while count < len(aWords):
        word = aWords[count]
        if is_numeric(word):
            # float() handles both "3" and "3.5"; the former isdigit()
            # guard silently dropped decimals
            val = float(word)
        elif isFractional_da(word):
            val = isFractional_da(word)
        elif isOrdinal_da(word):
            val = isOrdinal_da(word)
        elif word in da_numbers:
            val = da_numbers[word]
            # a number word directly followed by a fraction word is
            # multiplied with it, and the fraction word is consumed
            if count < (len(aWords) - 1):
                wordNext = aWords[count + 1]
            else:
                wordNext = ""
            valNext = isFractional_da(wordNext)

            if valNext:
                val = val * valNext
                aWords[count + 1] = ""

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])
            elif and_pass:
                # nothing to add after "og": keep the first number
                val = valPreAnd
                break
            else:
                count += 1
                continue

        aWords[count] = ""

        if and_pass:
            aWords[count - 1] = ''  # remove "og"
            val += valPreAnd
        elif count + 1 < len(aWords) and aWords[count + 1] == 'og':
            and_pass = True
            valPreAnd = val
            val = False
            count += 2
            continue
        elif count + 2 < len(aWords) and aWords[count + 2] == 'og':
            and_pass = True
            valPreAnd = val
            val = False
            count += 3
            continue

        break

    if not val and and_pass:
        # the text ended right after "og": keep the number found before it
        val = valPreAnd

    if not val:
        return False

    return val
|
|
|
|
|
|
def extract_datetime_da(string, currentDate, default_time):
    """
    Extract a date and/or time from a Danish utterance.

    The utterance is scanned twice: first for date expressions (today,
    tomorrow, weekdays, months, "<n> dage/uger/måneder/år", "næste"/"forrige"
    ...), then for time expressions ("8:30", "5pm", "<n> timer", "middag",
    "om aftenen" ...).  Every word that was understood is removed from the
    returned remainder.

    Args:
        string (str): the utterance to parse
        currentDate (datetime): the moment relative expressions refer to
        default_time: object with ``hour`` and ``minute`` attributes, used as
            time of day when the utterance names none; may be None
    Returns:
        [datetime, str] or None: the extracted datetime and the utterance
            without the consumed words; None when ``string`` is empty,
            ``currentDate`` is missing or no date/time expression was found
    """

    def clean_string(s):
        """
        Lower-case the input, drop punctuation and filler words and turn
        ordinal words into digits.  Returns the list of remaining words.
        """
        s = s.lower().replace('?', '').replace('.', '').replace(',', '')
        # ' om ' is stripped repeatedly (as the original chain did) so that
        # back-to-back occurrences sharing a space are removed as well
        for filler in (' den ', ' det ', ' om ', ' om ', ' på ', ' om '):
            s = s.replace(filler, ' ')
        wordList = s.split()

        for idx, word in enumerate(wordList):
            ordinal = isOrdinal_da(word)
            if ordinal is not False:
                wordList[idx] = str(ordinal)

        return wordList

    def date_found():
        """Tell whether any date or time information was extracted."""
        return found or \
            (
                datestr != "" or
                yearOffset != 0 or monthOffset != 0 or
                dayOffset is not False or hrOffset != 0 or
                hrAbs or minOffset != 0 or
                minAbs or secOffset != 0
            )

    def meridiem_for(qualifier, hour):
        """
        Map a time-of-day qualifier to "pm"/"am"; "" if it implies neither.
        For "natten" the hour decides: 8-12 is late evening, else early.
        """
        if qualifier.startswith(("eftermiddag", "aften")):
            return "pm"
        if qualifier.startswith("morgen"):
            return "am"
        if qualifier == "natten":
            return "pm" if 8 <= hour <= 12 else "am"
        return ""

    def blank(first, count):
        """Blank ``count`` words from ``first``, ignoring invalid indexes."""
        for pos in range(first, first + count):
            if 0 <= pos < len(words):
                words[pos] = ""

    if string == "" or not currentDate:
        return None

    found = False
    daySpecified = False
    dayOffset = False
    monthOffset = 0
    yearOffset = 0
    dateNow = currentDate
    today = dateNow.strftime("%w")
    currentYear = dateNow.strftime("%Y")
    fromFlag = False
    datestr = ""
    hasYear = False
    timeQualifier = ""

    # "forige" is a misspelling accepted for backward compatibility
    previousWords = ("forrige", "forige", "sidste")

    timeQualifiersList = ['tidlig',
                          'morgen',
                          'morgenen',
                          'formiddag',
                          'formiddagen',
                          'eftermiddag',
                          'eftermiddagen',
                          'aften',
                          'aftenen',
                          'nat',
                          'natten']
    markers = ['i', 'om', 'på', 'klokken', 'ved']
    days = ['mandag', 'tirsdag', 'onsdag',
            'torsdag', 'fredag', 'lørdag', 'søndag']
    months = ['januar', 'februar', 'marts', 'april', 'maj', 'juni',
              'juli', 'august', 'september', 'oktober', 'november',
              'december']
    monthsShort = ['jan', 'feb', 'mar', 'apr', 'maj', 'juni', 'juli', 'aug',
                   'sep', 'okt', 'nov', 'dec']

    validFollowups = days + months + monthsShort
    validFollowups.append("i dag")
    validFollowups.append("morgen")
    validFollowups.append("næste")
    validFollowups.extend(previousWords)
    validFollowups.append("nu")

    words = clean_string(string)

    # ---- first pass: dates ------------------------------------------------
    for idx, word in enumerate(words):
        if word == "":
            continue
        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        start = idx   # index of the first word consumed by this match
        used = 0      # number of words consumed by this match

        # "morgen" means tomorrow unless it follows "om" or a weekday.
        # This test must come before the qualifier test because "morgen"
        # is also listed as a time qualifier.
        if word == "morgen" and not fromFlag and wordPrev != "om" and \
                wordPrev not in days:
            dayOffset = 1
            used += 1
        # save timequalifier for later
        elif word in timeQualifiersList:
            timeQualifier = word
        # parse today, day after tomorrow
        elif word == "dag" and not fromFlag:
            dayOffset = 0
            used += 1
        elif word == "overmorgen" and not fromFlag:
            dayOffset = 2
            used += 1
        # parse 5 days, 10 weeks, last week, next week
        elif word == "dag" or word == "dage":
            if wordPrev.isdigit():
                dayOffset += int(wordPrev)
                start -= 1
                used = 2
        elif (word == "uge" or word == "uger") and not fromFlag:
            if wordPrev.isdigit():
                dayOffset += int(wordPrev) * 7
                start -= 1
                used = 2
            elif wordPrev == "næste":
                dayOffset = 7
                start -= 1
                used = 2
            elif wordPrev in previousWords:
                dayOffset = -7
                start -= 1
                used = 2
        # parse 10 months, next month, last month
        elif word in ("måned", "måneder") and not fromFlag:
            if wordPrev.isdigit():
                monthOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "næste":
                monthOffset = 1
                start -= 1
                used = 2
            elif wordPrev in previousWords:
                monthOffset = -1
                start -= 1
                used = 2
        # parse 5 years, next year, last year
        elif word == "år" and not fromFlag:
            if wordPrev.isdigit():
                yearOffset = int(wordPrev)
                start -= 1
                used = 2
            elif wordPrev == "næste":
                yearOffset = 1
                start -= 1
                used = 2
            elif wordPrev in previousWords:
                yearOffset = -1
                start -= 1
                used = 2
        # parse Monday, Tuesday, etc., and next Monday,
        # last Tuesday, etc.
        elif word in days and not fromFlag:
            d = days.index(word)
            dayOffset = (d + 1) - int(today)
            used = 1
            if dayOffset < 0:
                dayOffset += 7
            if wordNext == "morgen":
                # "morgen" means morning when it follows a weekday
                words[idx + 1] = "tidlig"
            if wordPrev == "næste":
                dayOffset += 7
                used += 1
                start -= 1
            elif wordPrev in previousWords:
                dayOffset -= 7
                used += 1
                start -= 1
        # parse 15 of July, June 20th, Feb 18, 19 of February
        elif (word in months or word in monthsShort) and not fromFlag:
            try:
                m = months.index(word)
            except ValueError:
                m = monthsShort.index(word)
            used += 1
            datestr = months[m]
            if wordPrev and (wordPrev[0].isdigit() or
                             (wordPrev == "of" and
                              wordPrevPrev[:1].isdigit())):
                if wordPrev == "of" and wordPrevPrev[:1].isdigit():
                    datestr += " " + words[idx - 2]
                    used += 1
                    start -= 1
                else:
                    datestr += " " + wordPrev
                start -= 1
                used += 1
                if wordNext and wordNext[0].isdigit():
                    datestr += " " + wordNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

            elif wordNext and wordNext[0].isdigit():
                datestr += " " + wordNext
                used += 1
                if wordNextNext and wordNextNext[0].isdigit():
                    datestr += " " + wordNextNext
                    used += 1
                    hasYear = True
                else:
                    hasYear = False

        # parse 5 days from tomorrow, 10 weeks from next thursday,
        # 2 months from July
        if word in ("fra", "til", "om") and wordNext in validFollowups:
            used = 2
            fromFlag = True
            if wordNext == "morgen" and \
                    wordPrev != "om" and \
                    wordPrev not in days:
                # "morgen" means tomorrow unless it follows "om" or a
                # weekday
                dayOffset += 1
            elif wordNext in days:
                d = days.index(wordNext)
                tmpOffset = (d + 1) - int(today)
                used = 2
                if tmpOffset < 0:
                    tmpOffset += 7
                dayOffset += tmpOffset
            elif wordNextNext and wordNextNext in days:
                d = days.index(wordNextNext)
                tmpOffset = (d + 1) - int(today)
                used = 3
                if wordNext == "næste":
                    tmpOffset += 7
                    used += 1
                    start -= 1
                elif wordNext in previousWords:
                    tmpOffset -= 7
                    used += 1
                    start -= 1
                dayOffset += tmpOffset

        if used > 0:
            if start - 1 > 0 and words[start - 1].startswith("denne"):
                start -= 1
                used += 1

            blank(start, used)

            if start - 1 >= 0 and words[start - 1] in markers:
                words[start - 1] = ""
            found = True
            daySpecified = True

    # ---- second pass: times -----------------------------------------------
    hrOffset = 0
    minOffset = 0
    secOffset = 0
    hrAbs = None    # None: no time seen, -1: only a relative offset seen
    minAbs = None

    # am/pm implied by the word(s) that follow a "HH:MM" time
    single_meridiem = {"am": "am", "pm": "pm", "morgen": "am",
                       "eftermiddag": "pm", "aften": "pm"}
    paired_meridiem = {("om", "morgenen"): "am",
                       ("om", "eftermiddagen"): "pm",
                       ("om", "aftenen"): "pm",
                       ("i", "morgen"): "am",
                       ("i", "eftermiddag"): "pm",
                       ("i", "aften"): "pm"}
    hour_units = ("time", "timer")
    minute_units = ("minut", "minutter")
    second_units = ("sekund", "sekunder")

    for idx, word in enumerate(words):
        if word == "":
            continue

        wordPrevPrev = words[idx - 2] if idx > 1 else ""
        wordPrev = words[idx - 1] if idx > 0 else ""
        wordNext = words[idx + 1] if idx + 1 < len(words) else ""
        wordNextNext = words[idx + 2] if idx + 2 < len(words) else ""

        # parse noon, midnight, morning, afternoon, evening
        used = 0
        if word.startswith("middag"):
            hrAbs = 12
            used += 1
        elif word == "midnat":
            hrAbs = 0
            used += 1
        elif word == "morgenen" or word == "tidlig":
            if not hrAbs:
                hrAbs = 8
            used += 1
        elif word.startswith("eftermiddag"):
            if not hrAbs:
                hrAbs = 15
            used += 1
        elif word.startswith("aften"):
            if not hrAbs:
                hrAbs = 19
            used += 1
        # parse half an hour, quarter hour
        elif word == "time" and \
                (wordPrev in markers or wordPrevPrev in markers):
            if wordPrev[:4] == "halv":
                minOffset = 30
            elif wordPrev == "kvarter":
                minOffset = 15
            elif wordPrev == "trekvarter":
                minOffset = 45
            else:
                hrOffset = 1
            if wordPrevPrev in markers:
                words[idx - 2] = ""
            words[idx - 1] = ""
            used += 1
            hrAbs = -1
            minAbs = -1
        # parse 5:00 am, 12:00 p.m., etc
        elif word[0].isdigit():
            isTime = True
            strHH = ""
            strMM = ""
            remainder = ""
            if ':' in word:
                # split "HH:MM<suffix>" with an explicit cursor so that no
                # character of the suffix is skipped
                pos = 0
                while pos < len(word) and word[pos].isdigit():
                    strHH += word[pos]
                    pos += 1
                if pos < len(word) and word[pos] == ":":
                    pos += 1
                    while pos < len(word) and word[pos].isdigit():
                        strMM += word[pos]
                        pos += 1
                remainder = word[pos:].replace(".", "")

                if remainder == "":
                    nextWord = wordNext.replace(".", "")
                    if nextWord in single_meridiem:
                        remainder = single_meridiem[nextWord]
                        used += 1
                    elif (nextWord, wordNextNext) in paired_meridiem:
                        remainder = paired_meridiem[(nextWord, wordNextNext)]
                        used += 2
                    elif nextWord == "natten":
                        remainder = "pm" if int(strHH) > 4 else "am"
                        used += 1
                    elif timeQualifier in ("aftenen", "eftermiddagen"):
                        # an evening/afternoon qualifier elsewhere in the
                        # utterance; 12 is added below for hours under 12
                        remainder = "pm"
            else:
                # try to parse numbers without colons
                # 5 hours, 10 minutes etc.
                strNum = "".join(ch for ch in word if ch.isdigit())
                remainder = "".join(ch for ch in word if not ch.isdigit())
                suffixInWord = remainder != ""

                if not suffixInWord:
                    remainder = wordNext.replace(".", "").strip()

                if remainder in ("pm", "p.m.") or \
                        wordNext in ("pm", "p.m."):
                    # only consume the next word if it carried the "pm"
                    used = 0 if (suffixInWord and
                                 remainder in ("pm", "p.m.")) else 1
                    strHH = strNum
                    remainder = "pm"
                elif remainder in ("am", "a.m.") or \
                        wordNext in ("am", "a.m."):
                    used = 0 if (suffixInWord and
                                 remainder in ("am", "a.m.")) else 1
                    strHH = strNum
                    remainder = "am"
                elif word.isdigit() and wordNext in hour_units and \
                        int(word) < 100:
                    # "in 3 hours"
                    hrOffset = int(word)
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif word.isdigit() and wordNext in minute_units:
                    # "in 10 minutes"
                    minOffset = int(word)
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif word.isdigit() and wordNext in second_units:
                    # "in 5 seconds"
                    secOffset = int(word)
                    used = 2
                    isTime = False
                    hrAbs = -1
                    minAbs = -1
                elif wordNext == timeQualifier:
                    # "<hour> <qualifier>".  This also matches a bare
                    # number at the end of an utterance without any
                    # qualifier (both sides are ""), as it always did.
                    strHH = strNum
                    strMM = ""
                    meridiem = meridiem_for(wordNext, int(strNum))
                    if meridiem:
                        used += 1
                        remainder = meridiem

            strHH = int(strHH) if strHH else 0
            strMM = int(strMM) if strMM else 0
            if remainder == "pm" and strHH < 12:
                strHH += 12
            if remainder == "am" and strHH >= 12:
                strHH -= 12
            if strHH > 24 or strMM > 59:
                isTime = False
                used = 0
            if isTime:
                hrAbs = strHH
                minAbs = strMM
                used += 1

        if used > 0:
            # remove parsed words from the sentence
            blank(idx, used)

            if wordPrev == "tidlig":
                hrOffset = -1
                words[idx - 1] = ""
            elif wordPrev == "sen":
                hrOffset = 1
                words[idx - 1] = ""
            # also drop a marker ("klokken", "ved" ...) right before the
            # time; wordPrev/wordPrevPrev are "" when out of range
            if wordPrev in markers:
                words[idx - 1] = ""
            if wordPrevPrev in markers:
                words[idx - 2] = ""

            found = True

    # check that we found a date
    if not date_found():
        return None

    if dayOffset is False:
        dayOffset = 0

    # perform date manipulation

    extractedDate = dateNow.replace(microsecond=0,
                                    second=0,
                                    minute=0,
                                    hour=0)
    if datestr != "":
        # datestr is "<danish month> [day [year]]"; only the month needs
        # translating for strptime (which expects English month names in
        # the default locale)
        en_months = ['january', 'february', 'march', 'april', 'may', 'june',
                     'july', 'august', 'september', 'october', 'november',
                     'december']
        parts = datestr.split()
        parts[0] = en_months[months.index(parts[0])]
        datestr_en = " ".join(parts)

        # a month without a day resolves to the first of that month
        formats = ("%B %d %Y",) if hasYear else ("%B %d", "%B")
        temp = None
        for fmt in formats:
            try:
                temp = datetime.strptime(datestr_en, fmt)
                break
            except ValueError:
                continue
        if temp is None:
            raise ValueError("could not parse date: " + datestr)

        if not hasYear:
            # carry the tzinfo over so that aware datetimes can be compared
            temp = temp.replace(year=extractedDate.year,
                                tzinfo=extractedDate.tzinfo)
            if extractedDate < temp:
                extractedDate = extractedDate.replace(
                    year=int(currentYear),
                    month=temp.month,
                    day=temp.day)
            else:
                # the date has already passed this year: use next year
                extractedDate = extractedDate.replace(
                    year=int(currentYear) + 1,
                    month=temp.month,
                    day=temp.day)
        else:
            extractedDate = extractedDate.replace(
                year=temp.year,
                month=temp.month,
                day=temp.day)

    if yearOffset != 0:
        extractedDate = extractedDate + relativedelta(years=yearOffset)
    if monthOffset != 0:
        extractedDate = extractedDate + relativedelta(months=monthOffset)
    if dayOffset != 0:
        extractedDate = extractedDate + relativedelta(days=dayOffset)

    if hrAbs is None and minAbs is None and default_time:
        hrAbs = default_time.hour
        minAbs = default_time.minute

    if hrAbs != -1 and minAbs != -1:

        extractedDate = extractedDate + relativedelta(hours=hrAbs or 0,
                                                      minutes=minAbs or 0)
        if (hrAbs or minAbs) and datestr == "":
            # a time of day that already passed today means tomorrow
            if not daySpecified and dateNow > extractedDate:
                extractedDate = extractedDate + relativedelta(days=1)
    if hrOffset != 0:
        extractedDate = extractedDate + relativedelta(hours=hrOffset)
    if minOffset != 0:
        extractedDate = extractedDate + relativedelta(minutes=minOffset)
    if secOffset != 0:
        extractedDate = extractedDate + relativedelta(seconds=secOffset)

    # drop an "og" that only joined two consumed expressions
    for idx, word in enumerate(words):
        before = words[idx - 1] if idx > 0 else ""
        after = words[idx + 1] if idx + 1 < len(words) else ""
        if word == "og" and before == "" and after == "":
            words[idx] = ""

    resultStr = " ".join(words)
    resultStr = ' '.join(resultStr.split())

    return [extractedDate, resultStr]
|
|
|
|
|
|
def isFractional_da(input_str):
|
|
"""
|
|
This function takes the given text and checks if it is a fraction.
|
|
|
|
Args:
|
|
input_str (str): the string to check if fractional
|
|
Returns:
|
|
(bool) or (float): False if not a fraction, otherwise the fraction
|
|
|
|
"""
|
|
if input_str.lower().startswith("halv"):
|
|
return 0.5
|
|
|
|
if input_str.lower() == "trediedel":
|
|
return 1.0 / 3
|
|
elif input_str.endswith('del'):
|
|
input_str = input_str[:len(input_str) - 3] # e.g. "fünftel"
|
|
if input_str.lower() in da_numbers:
|
|
return 1.0 / (da_numbers[input_str.lower()])
|
|
|
|
return False
|
|
|
|
|
|
def isOrdinal_da(input_str):
    """
    This function takes the given text and checks if it is an ordinal number.

    Args:
        input_str (str): the string to check if ordinal
    Returns:
        (bool) or (float): False if not an ordinal, otherwise the number
        corresponding to the ordinal

    Irregular ordinals are matched by prefix.  Regular ordinals are
    recognised by stripping "ende", "nde", "te" or (for the teens) "de" and
    looking the remaining stem up in da_numbers, so only ordinals of numbers
    listed there are found.

    """

    lowerstr = input_str.lower()

    # (prefix, value); matched in order, so a longer prefix must come
    # before a shorter one it starts with
    irregular = (
        ("første", 1),
        ("anden", 2),
        ("tredje", 3),
        ("tredie", 3),     # older spelling
        ("fjerde", 4),
        ("femtende", 15),  # must precede "femte"
        ("femte", 5),
        ("sjette", 6),
        ("ellevte", 11),
        ("elvte", 11),
        ("elfte", 11),     # previously accepted spelling
        ("tolvfte", 12),   # previously accepted spelling
    )
    for prefix, value in irregular:
        if lowerstr.startswith(prefix):
            return value

    # Each suffix is tried against the unmodified word: "tiende" -> "ti",
    # "ottende" -> "otte", "tolvte" -> "tolv".
    for suffix in ("ende", "nde", "te"):
        if lowerstr.endswith(suffix):
            stem = lowerstr[:-len(suffix)]
            if stem in da_numbers:
                return da_numbers[stem]

    # teens: cardinal + "de" ("sekstende" -> "seksten").  Restricted to
    # stems ending in "ten" so that ordinary words such as "ende" or "tide"
    # are not mistaken for ordinals.
    if lowerstr.endswith("de"):
        stem = lowerstr[:-2]
        if stem.endswith("ten") and stem in da_numbers:
            return da_numbers[stem]

    return False
|
|
|
|
|
|
def normalize_da(text, remove_articles):
    """
    Danish string normalization.

    Collapses runs of whitespace, optionally drops the articles "den" and
    "det", and replaces number words found in da_numbers by their digits.

    Args:
        text (str): the string to normalize
        remove_articles (bool): whether "den"/"det" should be removed
    Returns:
        (str): the normalized string
    """
    articles = ["den", "det"]
    kept = []
    # split() without arguments also swallows repeated whitespace
    for token in text.split():
        if remove_articles and token in articles:
            continue
        # number words become digits, everything else passes through
        kept.append(str(da_numbers[token]) if token in da_numbers else token)

    return " ".join(kept)
|
|
|
|
|
|
def extract_numbers_da(text, short_scale=True, ordinals=False):
    """
    Collect every number mentioned in a text.

    Delegates to extract_numbers_generic, handing it the Danish
    pronunciation and single-number extraction functions.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): passed through to extract_numbers_generic;
            selects "short scale" or "long scale" naming for large numbers.
            See https://en.wikipedia.org/wiki/Names_of_large_numbers
        ordinals (bool): passed through to extract_numbers_generic;
            consider ordinal numbers, e.g. third=3 instead of 1/3
    Returns:
        list: whatever extract_numbers_generic returns (the extracted
            numbers)
    """
    options = {"short_scale": short_scale, "ordinals": ordinals}
    return extract_numbers_generic(text, pronounce_number_da,
                                   extractnumber_da, **options)
|