Spaces:
Running
Running
import re | |
import pymupdf | |
from nltk.tokenize import sent_tokenize | |
def get_pdf_text(pdf_file) -> str:
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: Binary file-like object. The bytes returned by its
            ``read()`` method are handed to PyMuPDF as an in-memory PDF
            stream.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    # Open as a context manager so the document is closed even when text
    # extraction raises part-way through.
    with pymupdf.open(stream=pdf_file.read(), filetype="pdf") as doc:
        # join() consumes the generator while the document is still open.
        return "".join(page.get_text() for page in doc)
# Typographic characters that have a direct ASCII equivalent. Mapping a code
# point to None deletes it.
_TYPOGRAPHIC_TO_ASCII = str.maketrans({
    "\u00ad": None,   # soft hyphen: invisible line-break hint inside a word
    "\u201c": '"',    # left double quotation mark
    "\u201d": '"',    # right double quotation mark
    "\u2018": "'",    # left single quotation mark
    "\u2019": "'",    # right single quotation mark / apostrophe
    "\u2010": "-",    # hyphen
    "\u2011": "-",    # non-breaking hyphen
    "\u2012": "-",    # figure dash
    "\u2013": "-",    # en dash
    "\u2014": "-",    # em dash
    "\u2015": "-",    # horizontal bar
    "\u2212": "-",    # minus sign
})


def clean_text(text):
    """Normalize text to single-spaced ASCII.

    Processing order:
      1. Delete soft hyphens and map typographic quotes and dashes to their
         ASCII equivalents.
      2. Replace every remaining non-ASCII character with a space.
      3. Collapse each run of whitespace (newlines included) into a single
         space and strip leading/trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # This mapping has to run BEFORE the non-ASCII strip below: once that
    # strip has turned these characters into spaces there is nothing left
    # to map.
    text = text.translate(_TYPOGRAPHIC_TO_ASCII)
    text = re.sub(r"[^\x00-\x7F]", " ", text)
    # \s+ also matches "\n", so no separate newline replacement is needed.
    return re.sub(r"\s+", " ", text).strip()
# Function to create text chunks
def segment_text(text, max_segment_length=700, batch_size=7):
    """Split text into sentence-aligned segments and group them into batches.

    Sentences (as produced by ``sent_tokenize``) are appended to the current
    segment, each followed by a space, for as long as the segment's current
    length plus the next sentence's length stays within
    ``max_segment_length``. Otherwise the current segment is closed and the
    sentence starts a new one. A single sentence longer than the limit is
    never split; it becomes a segment of its own.

    Args:
        text: The text to segment.
        max_segment_length: Character budget per segment.
        batch_size: Number of segments per batch; must be at least 1.

    Returns:
        A list of batches, each a list of at most ``batch_size`` segment
        strings. The last batch may be shorter. Returns an empty list when
        no sentences are found.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A zero step makes range() raise anyway; a negative step would
        # silently yield no batches at all and drop every segment.
        raise ValueError("batch_size must be at least 1")

    segments = []
    current = ""
    for sentence in sent_tokenize(text):
        if len(current) + len(sentence) <= max_segment_length:
            current += sentence + " "
            continue
        # Only flush a non-empty segment: when the very first sentence is
        # already over the limit there is nothing accumulated yet, and
        # flushing would emit a spurious empty segment.
        if current:
            segments.append(current.strip())
        current = sentence + " "
    if current:
        segments.append(current.strip())

    # Create batches
    return [
        segments[start:start + batch_size]
        for start in range(0, len(segments), batch_size)
    ]