Doc-Reader-and-Chat

Sleeping

App Files Files Community

KingNish committed on Sep 18

Commit

217892e

•

1 Parent(s): 2de7f04

Create app.py

Browse files

Files changed (1) hide show

app.py +121 -0

app.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import PyPDF2
+from openpyxl import load_workbook
+from pptx import Presentation
+import gradio as gr
+import io
+import docx2python
+from huggingface_hub import InferenceClient
+# Module-level Inference API client bound to the Mistral-Nemo-Instruct-2407 model
+client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407")
+def read_document(file):
+    """Return the text content of an uploaded document.
+
+    The parser is chosen from the file extension: PDF, XLSX, PPTX and
+    DOC/DOCX get a dedicated reader; anything else is decoded as UTF-8 text.
+
+    Args:
+        file: An object exposing the file path as ``.name`` or a plain path
+            string. ``None`` (nothing uploaded) is tolerated.
+
+    Returns:
+        The extracted text, or a message starting with ``"Error reading"``
+        when the file is missing or cannot be read or parsed.
+    """
+    import os  # local import: the module header does not import os
+
+    if file is None:
+        return "Error reading file: no file provided"
+    # Accept both objects carrying the path in ``.name`` and bare path strings.
+    file_path = getattr(file, "name", file)
+    file_extension = os.path.splitext(str(file_path))[1].lstrip('.').lower()
+    try:
+        with open(file_path, "rb") as f:  # binary mode: the parsers expect bytes
+            file_content = f.read()
+    except OSError as e:
+        return f"Error reading file: {e}"
+
+    if file_extension == 'pdf':
+        try:
+            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
+            # ``or ''`` guards against pages whose extraction yields nothing.
+            return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
+        except Exception as e:
+            return f"Error reading PDF: {e}"
+
+    if file_extension == 'xlsx':
+        try:
+            wb = load_workbook(io.BytesIO(file_content))
+            # Skip empty cells so the literal text "None" is not emitted.
+            return ' '.join(
+                str(cell.value)
+                for sheet in wb.worksheets
+                for row in sheet.rows
+                for cell in row
+                if cell.value is not None
+            )
+        except Exception as e:
+            return f"Error reading XLSX: {e}"
+
+    if file_extension == 'pptx':
+        try:
+            presentation = Presentation(io.BytesIO(file_content))
+            # Only shapes that expose a ``text`` attribute contribute.
+            return ' '.join(
+                shape.text
+                for slide in presentation.slides
+                for shape in slide.shapes
+                if hasattr(shape, "text")
+            )
+        except Exception as e:
+            return f"Error reading PPTX: {e}"
+
+    if file_extension in ('doc', 'docx'):
+        try:
+            # The package's entry point is ``docx2python.docx2python``;
+            # the extracted document exposes its text as ``.text``.
+            doc_result = docx2python.docx2python(io.BytesIO(file_content))
+            try:
+                return doc_result.text
+            finally:
+                # Newer releases hold resources until closed; older ones have no close().
+                close = getattr(doc_result, "close", None)
+                if callable(close):
+                    close()
+        except Exception as e:
+            return f"Error reading DOC/DOCX: {e}"
+
+    # Fallback: treat everything else (TXT, CSV, source code, ...) as UTF-8 text.
+    try:
+        return file_content.decode('utf-8')
+    except UnicodeDecodeError as e:
+        return f"Error reading file: {e}"
+def chat_document(file, question):
+    """Answer a question about an uploaded document with the hosted model.
+
+    The document is read with ``read_document``, truncated to a fixed number
+    of characters, embedded in a prompt together with the question, and sent
+    to ``client.text_generation`` as a streamed request.
+
+    Args:
+        file: Uploaded document, passed through to ``read_document``.
+        question: The user's question about the document.
+
+    Returns:
+        The generated answer, or a short hint when the file or the question
+        is missing.
+    """
+    # Fail fast on missing input instead of issuing a pointless remote call.
+    if file is None:
+        return "Please upload a document first."
+    if not question or not question.strip():
+        return "Please enter a question about the document."
+    max_chars = 128000  # upper bound on document characters placed in the prompt
+    content = str(read_document(file))[:max_chars]
+    # Define system prompt for the chat API
+    system_prompt = """
+    You are a helpful and informative assistant that can answer questions based on the content of documents.
+    You will receive the content of a document and a question about it.
+    Your task is to provide a concise and accurate answer to the question based solely on the provided document content.
+    If the document does not contain enough information to answer the question, simply state that you cannot answer the question based on the provided information.
+    """
+    message = f"""[INST] [SYSTEM] {system_prompt}
+    Document Content: {content}
+    Question: {question}
+    Answer:"""
+    stream = client.text_generation(message, max_new_tokens=512, stream=True, details=True, return_full_text=False)
+    # Drop special tokens (e.g. end-of-sequence markers) so they do not leak into the answer.
+    return "".join(
+        response.token.text
+        for response in stream
+        if not response.token.special
+    )
+# Build the two-tab UI: each tab hosts one Interface wired to a function above.
+with gr.Blocks() as demo, gr.Tabs():
+    # Tab 1: display the text extracted from an uploaded file.
+    with gr.TabItem("Document Reader"):
+        iface1 = gr.Interface(
+            title="Document Reader",
+            description="Upload a document (PDF, XLSX, PPTX, TXT, CSV, DOC, DOCX and Code or text file) to read its content.",
+            fn=read_document,
+            inputs=gr.File(label="Upload a Document"),
+            outputs=gr.Textbox(label="Document Content"),
+        )
+    # Tab 2: ask the model a question about an uploaded file.
+    with gr.TabItem("Document Chat"):
+        iface2 = gr.Interface(
+            title="Document Chat",
+            description="Upload a document and ask questions about its content.",
+            fn=chat_document,
+            inputs=[
+                gr.File(label="Upload a Document"),
+                gr.Textbox(label="Question"),
+            ],
+            outputs=gr.Textbox(label="Answer"),
+        )
+# Start serving the app.
+demo.launch()