Shanulhaq committed
Commit 0198065
Parent: 380b4ce

Update app.py

Files changed (1)
  1. app.py +30 -16
app.py CHANGED
@@ -1,15 +1,12 @@
- #!pip install streamlit
-
-
# Install necessary libraries
- #pip install PyPDF2 transformers torch accelerate streamlit
+ #!pip install PyPDF2 transformers torch accelerate streamlit

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import PyPDF2
import streamlit as st

- # Step 1: Extract text from PDF
+ # Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    pdf_text = ""
    with open(pdf_path, "rb") as file:
@@ -19,7 +16,7 @@ def extract_text_from_pdf(pdf_path):
        pdf_text += page.extract_text()
    return pdf_text

- # Step 2: Initialize the tokenizer and model
+ # Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("ricepaper/vi-gemma-2b-RAG")
model = AutoModelForCausalLM.from_pretrained(
    "ricepaper/vi-gemma-2b-RAG",
@@ -31,7 +28,7 @@ model = AutoModelForCausalLM.from_pretrained(
if torch.cuda.is_available():
    model.to("cuda")

- # Step 3: Define the prompt format for the model
+ # Define the prompt format for the model
prompt = """
### Instruction and Input:
Based on the following context/document:
@@ -42,10 +39,10 @@ Please answer the question: {}
{}
"""

- # Step 4: Function to generate answer based on query and context
+ # Function to generate answer based on query and context
def generate_answer(context, query):
    input_text = prompt.format(context, query, "")
-     input_ids = tokenizer(input_text, return_tensors="pt")
+     input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)

    # Use GPU for input ids if available
    if torch.cuda.is_available():
@@ -62,11 +59,28 @@ def generate_answer(context, query):
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

- # Step 5: Read PDF and generate answer
- pdf_path = '/content/monopoly.pdf' # Replace with your PDF file path
- pdf_text = extract_text_from_pdf(pdf_path)
- query = "Your question here" # Replace with your question
-
- # Generate answer based on extracted PDF text and the query
- answer = generate_answer(pdf_text, query)
- print("Answer:", answer)
+ # Streamlit App
+ st.title("RAG-Based PDF Question Answering Application")
+
+ # Upload PDF
+ uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
+
+ if uploaded_file is not None:
+     # Extract text from the uploaded PDF
+     pdf_text = extract_text_from_pdf(uploaded_file)
+
+     st.write("Extracted text from PDF:")
+     st.text_area("PDF Content", pdf_text, height=200)
+
+     # User inputs their question
+     query = st.text_input("Enter your question about the PDF content:")
+
+     if st.button("Get Answer"):
+         if query.strip() != "":
+             # Generate answer based on extracted PDF text and the query
+             answer = generate_answer(pdf_text, query)
+             st.write("Answer:", answer)
+         else:
+             st.warning("Please enter a question.")
+ else:
+     st.info("Please upload a PDF file to get started.")
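Note: st.file_uploader returns an in-memory UploadedFile object rather than a filesystem path, but extract_text_from_pdf still opens its argument with open(pdf_path, "rb"), so passing the uploaded file straight through will raise a TypeError. Since PyPDF2.PdfReader accepts binary file-like objects as well as paths, one possible fix is sketched below; it assumes the elided function body (hidden between the first two hunks) builds a PyPDF2.PdfReader over the open file, which this diff does not show.

# Sketch of a possible fix, not part of this commit: accept either a path
# or a file-like object such as Streamlit's UploadedFile.
import PyPDF2

def extract_text_from_pdf(pdf_source):
    # PyPDF2.PdfReader handles both file paths and binary streams
    reader = PyPDF2.PdfReader(pdf_source)
    pdf_text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:  # extract_text() can return an empty string for image-only pages
            pdf_text += page_text
    return pdf_text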
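Note: the new truncation=True, max_length=1024 arguments cap the whole prompt at 1024 tokens. Because the template places the document before the question, a long PDF can push the question itself past the cutoff, where default right-side truncation silently drops it. A minimal alternative sketch, assuming the same tokenizer and prompt globals as the app (the 896-token context budget is illustrative, not from this commit):

# Sketch: truncate only the document text so the question always survives.
def build_inputs(context, query, context_budget=896):
    # Clip the context to the token budget, then re-decode it to text
    context_ids = tokenizer(context, truncation=True,
                            max_length=context_budget).input_ids
    short_context = tokenizer.decode(context_ids, skip_special_tokens=True)
    # Format the prompt with the clipped context and tokenize as before
    input_text = prompt.format(short_context, query, "")
    return tokenizer(input_text, return_tensors="pt")

Once extract_text_from_pdf accepts the uploaded file, the app can be launched locally with streamlit run app.py.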