File size: 472 Bytes
66340f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
from textwrap3 import dedent
from unidecode import unidecode
import re


def chunk_text(text: str, max_size: int = 4000):
    """Split *text* into chunks made of whole sentences.

    The text is dedented, transliterated with ``unidecode`` and cut into
    sentences at ``.``, ``?`` and ``!``.  Consecutive sentences are then
    packed greedily into chunks whose length stays below *max_size*.

    Args:
        text: The text to split.
        max_size: Exclusive upper bound on the length of a chunk built from
            several sentences.

    Returns:
        A list of non-empty, whitespace-stripped chunks in original order.
        A single sentence that is itself ``max_size`` characters or longer
        is returned as its own (oversized) chunk rather than being dropped.
    """
    paragraphs = dedent(text)
    # A sentence is a run of non-terminators followed by any terminators.
    # Making the terminators optional keeps a trailing fragment that has no
    # closing punctuation instead of silently discarding it.
    sentences = re.findall(r"[^.?!]+[.?!]*", unidecode(paragraphs))

    chunks = []
    chunk = ""
    for sentence in sentences:
        # Flush *before* adding the sentence that would overflow, so that
        # sentence opens the next chunk instead of being lost.
        if chunk and len(chunk) + len(sentence) >= max_size:
            finished = chunk.strip()
            if finished:
                chunks.append(finished)
            chunk = ""
        chunk += sentence

    # The last chunk is still pending once the loop ends.
    remainder = chunk.strip()
    if remainder:
        chunks.append(remainder)

    return chunks