Shanulhaq committed on
Commit
2189cdb
1 Parent(s): 849e15a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -96
app.py CHANGED
@@ -1,102 +1,48 @@
1
- #!pip install PyPDF2 pandas transformers torch accelerate streamlit
2
-
3
- from transformers import AutoTokenizer, AutoModelForCausalLM
4
- import torch
5
- import PyPDF2
6
- import pandas as pd
7
  import streamlit as st
8
-
9
- # Function to extract text from PDF
10
- def extract_text_from_pdf(uploaded_file):
11
- pdf_text = ""
12
- reader = PyPDF2.PdfReader(uploaded_file)
13
- for page_num in range(len(reader.pages)):
14
- page = reader.pages[page_num]
15
- pdf_text += page.extract_text()
16
- return pdf_text
17
-
18
- # Function to extract text from CSV
19
- def extract_text_from_csv(uploaded_file):
20
- df = pd.read_csv(uploaded_file)
21
- csv_text = df.to_string(index=False)
22
- return csv_text
23
-
24
- # Initialize the tokenizer and model
25
- tokenizer = AutoTokenizer.from_pretrained("ricepaper/vi-gemma-2b-RAG")
26
-
27
- model = AutoModelForCausalLM.from_pretrained(
28
- "ricepaper/vi-gemma-2b-RAG",
29
- torch_dtype=torch.bfloat16
30
- )
31
-
32
- # Move model to GPU if available
33
- if torch.cuda.is_available():
34
- model.to("cuda")
35
-
36
- # Define the prompt format for the model
37
- prompt = """
38
- ### Instruction and Input:
39
- Based on the following context/document:
40
- {}
41
- Please answer the question: {}
42
- ### Response:
43
- {}
44
- """
45
-
46
- # Function to generate answer based on query and context
47
- def generate_answer(context, query):
48
- input_text = prompt.format(context, query, "")
49
- input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
50
-
51
- # Use GPU for input ids if available
52
- if torch.cuda.is_available():
53
- input_ids = input_ids.to("cuda")
54
-
55
- # Generate text using the model
56
- outputs = model.generate(
57
- **input_ids,
58
- max_new_tokens=500,
59
- no_repeat_ngram_size=5,
60
- )
61
-
62
- # Decode and print the results
63
- answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
64
- return answer
65
-
66
- # Streamlit App
67
- st.title("RAG-Based Multi-File Question Answering Application")
68
-
69
- # Upload PDF or CSV
70
- uploaded_files = st.file_uploader("Upload PDF or CSV files", type=['pdf', 'csv'], accept_multiple_files=True)
71
-
72
  if uploaded_files:
73
- combined_text = ""
74
-
75
- # Process each uploaded file
76
- for uploaded_file in uploaded_files:
77
- if uploaded_file.type == "application/pdf":
78
- # Extract text from PDF
79
- pdf_text = extract_text_from_pdf(uploaded_file)
80
- combined_text += pdf_text + "\n"
81
- st.write(f"Extracted text from PDF: {uploaded_file.name}")
82
-
83
- elif uploaded_file.type == "text/csv":
84
- # Extract text from CSV
85
- csv_text = extract_text_from_csv(uploaded_file)
86
- combined_text += csv_text + "\n"
87
- st.write(f"Extracted text from CSV: {uploaded_file.name}")
88
-
89
- st.text_area("Combined File Content", combined_text, height=200)
90
-
91
- # User inputs their question
92
- query = st.text_input("Enter your question about the uploaded content:")
93
 
 
 
 
94
  if st.button("Get Answer"):
95
- if query.strip() != "":
96
- # Generate answer based on combined extracted text and the query
97
- answer = generate_answer(combined_text, query)
98
- st.write("Answer:", answer)
99
  else:
100
- st.warning("Please enter a question.")
101
  else:
102
- st.info("Please upload PDF or CSV files to get started.")
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import fitz # PyMuPDF
3
+ from transformers import pipeline
4
+ import glob
5
+
6
+ # Function to extract text from PDFs
7
+ def extract_text_from_pdfs(pdf_files):
8
+ pdf_texts = {}
9
+ for pdf_file in pdf_files:
10
+ with fitz.open(pdf_file) as doc:
11
+ text = ""
12
+ for page in doc:
13
+ text += page.get_text()
14
+ pdf_texts[pdf_file] = text
15
+ return pdf_texts
16
+
17
+ # Load pre-trained QA model
18
+ qa_pipeline = pipeline('question-answering', model='distilbert-base-uncased-distilled-squad')
19
+
20
+ # Function to answer questions based on extracted text
21
+ def answer_question(pdf_texts, question):
22
+ context = " ".join(pdf_texts.values())
23
+ result = qa_pipeline(question=question, context=context)
24
+ return result['answer']
25
+
26
+ # Streamlit application
27
+ st.title("PDF Question Answering App")
28
+
29
+ # File uploader for PDF files
30
+ uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
31
+
32
+ # Display uploaded files
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  if uploaded_files:
34
+ # Extract text from PDFs
35
+ pdf_texts = extract_text_from_pdfs([file.name for file in uploaded_files])
36
+ st.write("PDFs Uploaded Successfully!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
+ # Question input
39
+ question = st.text_input("Enter your question:")
40
+
41
  if st.button("Get Answer"):
42
+ if question:
43
+ answer = answer_question(pdf_texts, question)
44
+ st.write(f"Answer: {answer}")
 
45
  else:
46
+ st.write("Please enter a question.")
47
  else:
48
+ st.write("Please upload PDF files to continue.")