Spaces:

Shanulhaq
/

Doctore-AI

Sleeping

App Files Files Community

Shanulhaq committed on Sep 1

Commit

2189cdb

•

1 Parent(s): 849e15a

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -96

app.py CHANGED Viewed

@@ -1,102 +1,48 @@
-#!pip install PyPDF2 pandas transformers torch accelerate streamlit
-from transformers import AutoTokenizer, AutoModelForCausalLM
-import torch
-import PyPDF2
-import pandas as pd
 import streamlit as st
-# Function to extract text from PDF
-def extract_text_from_pdf(uploaded_file):
-    pdf_text = ""
-    reader = PyPDF2.PdfReader(uploaded_file)
-    for page_num in range(len(reader.pages)):
-        page = reader.pages[page_num]
-        pdf_text += page.extract_text()
-    return pdf_text
-# Function to extract text from CSV
-def extract_text_from_csv(uploaded_file):
-    df = pd.read_csv(uploaded_file)
-    csv_text = df.to_string(index=False)
-    return csv_text
-# Initialize the tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained("ricepaper/vi-gemma-2b-RAG")
-model = AutoModelForCausalLM.from_pretrained(
-    "ricepaper/vi-gemma-2b-RAG",
-    torch_dtype=torch.bfloat16
-)
-# Move model to GPU if available
-if torch.cuda.is_available():
-    model.to("cuda")
-# Define the prompt format for the model
-prompt = """
-### Instruction and Input:
-Based on the following context/document:
-{}
-Please answer the question: {}
-### Response:
-{}
-"""
-# Function to generate answer based on query and context
-def generate_answer(context, query):
-    input_text = prompt.format(context, query, "")
-    input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
-    # Use GPU for input ids if available
-    if torch.cuda.is_available():
-        input_ids = input_ids.to("cuda")
-    # Generate text using the model
-    outputs = model.generate(
-        **input_ids,
-        max_new_tokens=500,
-        no_repeat_ngram_size=5,
-    )
-    # Decode and print the results
-    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return answer
-# Streamlit App
-st.title("RAG-Based Multi-File Question Answering Application")
-# Upload PDF or CSV
-uploaded_files = st.file_uploader("Upload PDF or CSV files", type=['pdf', 'csv'], accept_multiple_files=True)
 if uploaded_files:
-    combined_text = ""
-    # Process each uploaded file
-    for uploaded_file in uploaded_files:
-        if uploaded_file.type == "application/pdf":
-            # Extract text from PDF
-            pdf_text = extract_text_from_pdf(uploaded_file)
-            combined_text += pdf_text + "\n"
-            st.write(f"Extracted text from PDF: {uploaded_file.name}")
-        elif uploaded_file.type == "text/csv":
-            # Extract text from CSV
-            csv_text = extract_text_from_csv(uploaded_file)
-            combined_text += csv_text + "\n"
-            st.write(f"Extracted text from CSV: {uploaded_file.name}")
-    st.text_area("Combined File Content", combined_text, height=200)
-    # User inputs their question
-    query = st.text_input("Enter your question about the uploaded content:")
     if st.button("Get Answer"):
-        if query.strip() != "":
-            # Generate answer based on combined extracted text and the query
-            answer = generate_answer(combined_text, query)
-            st.write("Answer:", answer)
         else:
-            st.warning("Please enter a question.")
 else:
-    st.info("Please upload PDF or CSV files to get started.")

 import streamlit as st
+import fitz  # PyMuPDF
+from transformers import pipeline
+import glob
+# Function to extract text from PDFs
+def extract_text_from_pdfs(pdf_files):
+    """Extract the full text of each PDF.
+
+    Args:
+        pdf_files: Iterable of PDF sources. Each item is either a filesystem
+            path or a binary file-like object exposing ``read()`` (such as an
+            in-memory upload), which is parsed directly from its bytes.
+
+    Returns:
+        dict mapping each source to the concatenated text of its pages. The
+        key is the path as given, or for a file-like object its ``name``
+        attribute (the object itself when it has none).
+    """
+    pdf_texts = {}
+    for pdf_file in pdf_files:
+        if hasattr(pdf_file, "read"):
+            # In-memory sources have no path on disk, so hand PyMuPDF the raw bytes.
+            key = getattr(pdf_file, "name", pdf_file)
+            doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+        else:
+            key = pdf_file
+            doc = fitz.open(pdf_file)
+        with doc:
+            # join() avoids re-copying the accumulated text for every page.
+            pdf_texts[key] = "".join(page.get_text() for page in doc)
+    return pdf_texts
+# Load pre-trained QA model
+@st.cache_resource
+def _load_qa_pipeline():
+    """Build the extractive question-answering pipeline.
+
+    Streamlit re-executes the script on every interaction; caching the
+    pipeline as a resource keeps the model from being reloaded each time.
+    """
+    return pipeline('question-answering', model='distilbert-base-uncased-distilled-squad')
+qa_pipeline = _load_qa_pipeline()
+# Function to answer questions based on extracted text
+def answer_question(pdf_texts, question):
+    """Answer *question* using the text of all extracted PDFs.
+
+    Args:
+        pdf_texts: dict whose values are the extracted document texts.
+        question: The question to ask.
+
+    Returns:
+        The answer string produced by the QA pipeline, or an explanatory
+        message when the documents contain no text to search.
+    """
+    context = " ".join(pdf_texts.values())
+    if not context.strip():
+        # The QA pipeline rejects an empty context (e.g. PDFs with no extractable text).
+        return "No readable text was found in the uploaded PDFs."
+    result = qa_pipeline(question=question, context=context)
+    return result['answer']
+# Streamlit application
+import os
+import tempfile
+st.title("PDF Question Answering App")
+# File uploader for PDF files
+uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
+# Display uploaded files
 if uploaded_files:
+    # Uploads are held in memory, so their names are not openable paths;
+    # write each one to a temporary file before extracting text.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        pdf_paths = []
+        for index, file in enumerate(uploaded_files):
+            # Index prefix keeps same-named uploads from overwriting each other.
+            pdf_path = os.path.join(tmp_dir, f"{index}_{os.path.basename(file.name)}")
+            with open(pdf_path, "wb") as tmp_pdf:
+                tmp_pdf.write(file.getvalue())
+            pdf_paths.append(pdf_path)
+        # Extract while the temporary files still exist.
+        pdf_texts = extract_text_from_pdfs(pdf_paths)
+    st.write("PDFs Uploaded Successfully!")
+    # Question input
+    question = st.text_input("Enter your question:")
     if st.button("Get Answer"):
+        if question:
+            answer = answer_question(pdf_texts, question)
+            st.write(f"Answer: {answer}")
         else:
+            st.write("Please enter a question.")
 else:
+    st.write("Please upload PDF files to continue.")