changes to mycroft core to do longer sequence generations

pull/1795/head
Michael Nguyen 2018-09-06 15:05:00 -05:00
parent 3c93fea807
commit 7b9ed611aa
1 changed files with 55 additions and 46 deletions

View File

@ -28,6 +28,9 @@ import hashlib
import re import re
max_sentence_size = 170
def break_chunks(l, n): def break_chunks(l, n):
"""Yield successive n-sized chunks from l.""" """Yield successive n-sized chunks from l."""
for i in range(0, len(l), n): for i in range(0, len(l), n):
@ -71,41 +74,27 @@ def split_by_chunk_size(text, chunk_size):
)) ))
def split_by_punctuation(text, chunk_size): def split_by_punctuation(text, puncs=[]):
"""split text by punctuations splits = text.split()
i.e "hello, world" -> ["hello", "world"] split_by_punc = False
for punc in puncs:
Args:
text (str): text to split
chunk_size (int): size of each chunk
Returns:
list: list of sentence chunk
"""
punctuations = [',', '.', '-', '?', '!', ':', ';']
text_list = text.split()
splits = None
if len(text_list) >= chunk_size:
for punc in punctuations:
if punc in text: if punc in text:
splits = text.split(punc) splits = text.split(punc)
split_by_punc = True
break break
if split_by_punc:
# TODO: check if splits are too small, combine them
return splits return splits
else:
return [text]
def add_punctuation(text): def add_punctuation(text):
"""add punctuation at the end of each chunk. Mimic2 """add punctuation at the end of each chunk. Mimic2
expects a form of punctuation expects some form of punctuations
""" """
punctuations = ['.', '?', '!'] punctuations = ['.', '?', '!']
if len(text) < 1: if len(text) < 1:
return text return text
if len(text) < 10:
if text[-1] in punctuations:
if text[-1] != ".":
return text[:-1] + "."
if text[-1] not in punctuations: if text[-1] not in punctuations:
text += '.' text += '.'
return text return text
@ -124,30 +113,50 @@ def sentence_chunker(text, chunk_size, split_by_punc=True):
Returns: Returns:
list: list of text chunks list: list of text chunks
""" """
text_list = text.split() if len(text) <= max_sentence_size:
# if initial text is 1.3 times chunk size, no need to split
# if the character count is less than 55
if len(text_list) <= chunk_size * 1.3:
if len(text) < 55:
return [add_punctuation(text)] return [add_punctuation(text)]
# split text by punctuations if split_by_punc set to true # split text by punctuations if split_by_punc set to true
punc_splits = None chunks = None
if split_by_punc: if split_by_punc:
punc_splits = split_by_punctuation(text, chunk_size) # first split by periods ending puncs
LOG.info("!")
chunks = split_by_punctuation(
text.strip(),
puncs=['.', '!', '?', ':', '-', ';']
)
# split text by chunk size # if sentence is still too big, split by commas
chunks = [] second_splits = []
if punc_splits: did_second_split = False
for sentence in punc_splits: for sentence in chunks:
sentence = sentence.strip() if len(sentence) > max_sentence_size:
chunks += split_by_chunk_size(sentence, chunk_size) comma_splits = split_by_punctuation(
# split text by chunk size sentence.strip(), puncs=[',']
)
second_splits += comma_splits
did_second_split = True
else: else:
chunks += split_by_chunk_size(text, chunk_size) second_splits.append(sentence.strip())
if did_second_split:
chunks = second_splits
# if sentence is still too big, split it by chunk size (20)
third_splits = []
did_third_split = False
for sentence in chunks:
if len(sentence) > max_sentence_size:
chunk_split = split_by_chunk_size(sentence.strip(), 20)
third_splits += chunk_split
did_third_split = True
else:
third_splits.append(sentence.strip())
if did_third_split:
chunks = third_splits
chunks = [add_punctuation(chunk) for chunk in chunks] chunks = [add_punctuation(chunk) for chunk in chunks]
return chunks return chunks