zinoubm's picture
initial commit
66340f1
raw
history blame contribute delete
472 Bytes
from textwrap3 import dedent
from unidecode import unidecode
import re
def chunk_text(text: str, max_size: int = 4000) -> list:
    """Split *text* into sentence-aligned chunks shorter than *max_size*.

    The text is dedented, passed through ``unidecode`` and cut into
    sentences at ``.``, ``?`` and ``!``.  Consecutive sentences are then
    packed greedily into chunks whose length stays strictly below
    *max_size* characters.

    Args:
        text: The text to split.
        max_size: Exclusive upper bound on the length of a chunk.  For
            values below 2 the pieces degrade to single characters so the
            function still terminates.

    Returns:
        The chunks in document order, each stripped of surrounding
        whitespace.  Chunks that are empty after stripping are omitted, so
        empty or whitespace-only input yields ``[]``.
    """
    # Longest piece that still satisfies the strict "< max_size" bound; never
    # below 1 so the hard-split loop below always makes progress.
    limit = max(max_size - 1, 1)

    ascii_text = unidecode(dedent(text))

    # First alternative: optional words followed by a run of terminators, so
    # "..." and "?!" stay attached to their sentence.  Second alternative:
    # trailing text that has no final terminator.  Together they cover every
    # character of the input, so nothing is silently discarded.
    sentences = re.findall(r"[^.?!]*[.?!]+|[^.?!]+", ascii_text)

    chunks = []
    parts = []  # sentences collected for the chunk currently being built
    size = 0    # combined length of the strings in ``parts``

    for sentence in sentences:
        # The sentence does not fit: close the current chunk and start a new
        # one with this sentence instead of dropping it.
        if parts and size + len(sentence) > limit:
            chunks.append("".join(parts).strip())
            parts, size = [], 0

        # A single sentence longer than the limit cannot be kept whole
        # without breaking the size bound, so it is cut into fixed-size
        # pieces; the remainder starts the next chunk.
        while len(sentence) > limit:
            chunks.append(sentence[:limit].strip())
            sentence = sentence[limit:]

        parts.append(sentence)
        size += len(sentence)

    # Emit whatever is left once the input is exhausted.
    if parts:
        chunks.append("".join(parts).strip())

    return [chunk for chunk in chunks if chunk]