import re

import nltk
import pymupdf
import pypandoc
from bs4 import BeautifulSoup
from docx import Document
from nltk.tokenize import sent_tokenize
from pptx import Presentation

# sent_tokenize requires NLTK's punkt sentence models; this is a no-op once cached.
nltk.download("punkt", quiet=True)
# Function to normalize extracted text
def clean_text(text):
    # Normalize typographic characters before the non-ASCII filter runs,
    # so these substitutions can still match.
    text = text.replace('\xad', '')                           # drop soft hyphens
    text = re.sub(r'[“”]', '"', text)                         # curly double quotes
    text = re.sub(r"[‘’]", "'", text)                         # curly single quotes
    text = re.sub(r'[\u2010\u2011\u2013\u2014]', '-', text)   # hyphen/dash variants
    text = re.sub(r'[^\x00-\x7F]', ' ', text)                 # remaining non-ASCII
    text = re.sub(r'\n', ' ', text)                           # newlines to spaces
    text = re.sub(r'\s+', ' ', text).strip()                  # collapse whitespace
    return text
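# Quick illustration (hypothetical input, not from the source):
#   clean_text('He said “hello” –\nonce.')  ->  'He said "hello" - once.'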
# Function to create text chunks
def segment_text(text, max_segment_length=700, batch_size=7):
    sentences = sent_tokenize(text)
    segments = []
    current_segment = ""
    for sentence in sentences:
        # Grow the current chunk until the next sentence would push it
        # past the length cap, then start a new chunk.
        if len(current_segment) + len(sentence) <= max_segment_length:
            current_segment += sentence + " "
        else:
            if current_segment:  # avoid emitting an empty leading segment
                segments.append(current_segment.strip())
            current_segment = sentence + " "
    if current_segment:
        segments.append(current_segment.strip())
    # Group the chunks into batches of batch_size
    batches = [segments[i:i + batch_size] for i in range(0, len(segments), batch_size)]
    return batches
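# Shape check (illustrative values, not from the original module):
#   segment_text("Cats sleep. Dogs bark. Birds sing. Fish swim.",
#                max_segment_length=12, batch_size=2)
#   -> [['Cats sleep.', 'Dogs bark.'], ['Birds sing.', 'Fish swim.']]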
# Function to get text from a PDF file
def get_pdf_text(pdf_file):
    doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text
# Function to get text from a DOCX file
# (python-docx reads .docx only, not legacy binary .doc)
def get_doc_text(doc_file):
    doc = Document(doc_file)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text
# Function to get text from a PPTX file
# (python-pptx reads .pptx only, not legacy binary .ppt)
def get_ppt_text(ppt_file):
    prs = Presentation(ppt_file)
    text = ""
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text += shape.text + "\n"
    return text
# Function to get text from an HTML file
def get_html_text(html_file):
    # The caller passes a single file-like object, so parse it directly
    # rather than treating the argument as a list of paths.
    soup = BeautifulSoup(html_file.read(), 'html.parser')
    return soup.get_text()
# Function to get text from a LaTeX file
def get_latex_text(latex_file):
    # pypandoc.convert_file expects a filesystem path, but the caller hands
    # over an in-memory upload, so convert the decoded source text instead.
    source = latex_file.getvalue().decode("utf-8")
    return pypandoc.convert_text(source, 'plain', format='latex')
# Function to parse text from a plain-text file
def parse_text(file):
    return file.getvalue().decode("utf-8")
# Function to get text from uploaded documents
def get_text_from_document(file):
    content = ""
    if file is not None:
        name = file.name.lower()
        if name.endswith('.pdf'):
            content += get_pdf_text(file)
        elif name.endswith('.docx'):  # legacy .doc is not supported by python-docx
            content += get_doc_text(file)
        elif name.endswith('.pptx'):  # legacy .ppt is not supported by python-pptx
            content += get_ppt_text(file)
        elif name.endswith('.html'):
            content += get_html_text(file)
        elif name.endswith('.tex'):
            content += get_latex_text(file)
        elif name.endswith('.txt'):
            content += parse_text(file)
    return content
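# Minimal usage sketch, assuming the helpers above receive an uploaded-file
# object exposing .name, .read(), and .getvalue() (e.g. a Streamlit upload);
# a plain binary file handle satisfies the first two, and the path below
# is hypothetical.
if __name__ == "__main__":
    with open("example.pdf", "rb") as f:
        raw = get_text_from_document(f)
    batches = segment_text(clean_text(raw))
    print(f"{len(batches)} batch(es) of up to 7 segments each")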