Spaces:
Sleeping
Sleeping
import PyPDF2 | |
from openpyxl import load_workbook | |
from pptx import Presentation | |
import gradio as gr | |
import io | |
import docx2python | |
from huggingface_hub import InferenceClient | |
# Shared Inference API client for the Mistral Nemo instruct model;
# chat_document() sends its prompts through this object.
client = InferenceClient(model="mistralai/Mistral-Nemo-Instruct-2407")
def _read_pdf(data):
    """Return the concatenated text of every page of a PDF given as bytes."""
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(data))
    # Guard against a page yielding no text (None/empty) so the join never
    # fails on a non-string value.
    return ''.join(page.extract_text() or '' for page in pdf_reader.pages)


def _read_xlsx(data):
    """Return every non-empty cell of every worksheet, space separated."""
    workbook = load_workbook(io.BytesIO(data))
    parts = []
    for sheet in workbook.worksheets:
        for row in sheet.rows:
            for cell in row:
                # Empty cells hold None; skip them instead of emitting the
                # literal text "None" into the extracted content.
                if cell.value is not None:
                    parts.append(f"{cell.value} ")
    return ''.join(parts)


def _read_pptx(data):
    """Return the text of every shape that exposes a ``text`` attribute."""
    presentation = Presentation(io.BytesIO(data))
    parts = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                parts.append(shape.text + ' ')
    return ''.join(parts)


def _read_docx(data):
    """Return the text of a Word document given as bytes."""
    # NOTE(review): the documented entry point of the docx2python package is
    # the docx2python() function, whose result exposes the extracted text as
    # `.text`; the previously used `docx2python.convert` is not part of that
    # documented API. Confirm against the installed package version.
    return docx2python.docx2python(io.BytesIO(data)).text


def _read_plain_text(data):
    """Decode raw bytes as UTF-8 text."""
    return data.decode('utf-8')


# Maps a lower-cased file extension to (label used in error messages, reader).
_DOCUMENT_READERS = {
    'pdf': ("PDF", _read_pdf),
    'xlsx': ("XLSX", _read_xlsx),
    'pptx': ("PPTX", _read_pptx),
    'doc': ("DOC/DOCX", _read_docx),
    'docx': ("DOC/DOCX", _read_docx),
}


def read_document(file):
    """Extract the text content of an uploaded document.

    The parser is chosen from the file extension: PDF, XLSX, PPTX and
    DOC/DOCX use their dedicated readers; any other file is decoded as
    UTF-8 text.

    Args:
        file: A filesystem path (``str``) or an object whose ``name``
            attribute holds the path, such as the value Gradio passes for an
            uploaded file. ``None`` (nothing uploaded) is reported as an
            error string.

    Returns:
        The extracted text. Failures are never raised; they are returned as
        a string starting with ``"Error reading"`` so the result can always
        be shown in a text box.
    """
    if file is None:
        return "Error reading file: no file was provided"

    # Accept both plain path strings and objects exposing the path as `.name`.
    file_path = str(getattr(file, "name", file))
    file_extension = file_path.split('.')[-1].lower()

    try:
        with open(file_path, "rb") as f:  # binary mode: the parsers need raw bytes
            file_content = f.read()
    except OSError as e:
        # Keep the "errors are returned as text" contract for I/O failures too.
        return f"Error reading file: {e}"

    label, reader = _DOCUMENT_READERS.get(
        file_extension, ("file", _read_plain_text)
    )
    try:
        return reader(file_content)
    except Exception as e:
        # Boundary handler: the third-party parsers raise many different
        # exception types, and the UI expects a message rather than a crash.
        return f"Error reading {label}: {e}"
def chat_document(file, question):
    """Answer a question about an uploaded document with the chat model.

    The document text is obtained from read_document(), capped at 128000
    characters, and sent together with the question in a single prompt.

    Args:
        file: The uploaded file, forwarded unchanged to read_document().
        question: The question to ask about the document.

    Returns:
        The generated answer, assembled from the streamed token texts.
    """
    # Slicing is a no-op for shorter text, so no explicit length check is needed.
    document_text = str(read_document(file))[:128000]

    # Instructions that frame the model's role for the request.
    system_prompt = """
You are a helpful and informative assistant that can answer questions based on the content of documents.
You will receive the content of a document and a question about it.
Your task is to provide a concise and accurate answer to the question based solely on the provided document content.
If the document does not contain enough information to answer the question, simply state that you cannot answer the question based on the provided information.
"""
    message = f"""[INST] [SYSTEM] {system_prompt}
Document Content: {document_text}
Question: {question}
Answer:"""

    token_stream = client.text_generation(
        message,
        max_new_tokens=512,
        stream=True,
        details=True,
        return_full_text=False,
    )
    # Each streamed item carries one generated token; concatenate their texts.
    return "".join(chunk.token.text for chunk in token_stream)
# Two-tab UI: the first tab shows the text extracted from an upload, the
# second answers a question about the uploaded document.
with gr.Blocks() as demo, gr.Tabs():
    with gr.TabItem("Document Reader"):
        iface1 = gr.Interface(
            title="Document Reader",
            description="Upload a document (PDF, XLSX, PPTX, TXT, CSV, DOC, DOCX and Code or text file) to read its content.",
            fn=read_document,
            inputs=gr.File(label="Upload a Document"),
            outputs=gr.Textbox(label="Document Content"),
        )
    with gr.TabItem("Document Chat"):
        iface2 = gr.Interface(
            title="Document Chat",
            description="Upload a document and ask questions about its content.",
            fn=chat_document,
            inputs=[
                gr.File(label="Upload a Document"),
                gr.Textbox(label="Question"),
            ],
            outputs=gr.Textbox(label="Answer"),
        )

# Start the web server for the assembled interface.
demo.launch()