Shanulhaq commited on
Commit
9a8819b
1 Parent(s): 4672bdd

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -0
app.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Dependency setup.
# NOTE(review): the original `!pip install PyPDF2 transformers torch accelerate`
# is IPython/Colab magic syntax and is a SyntaxError in a plain .py file.
# Install the dependencies from a shell instead:
#   pip install PyPDF2 transformers torch accelerate

import PyPDF2
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
8
# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Extract and concatenate the text of every page of a PDF.

    Parameters
    ----------
    pdf_path : str or os.PathLike
        Path to the PDF file to read.

    Returns
    -------
    str
        Concatenated text of all pages (empty string for an empty PDF).
    """
    pdf_text = ""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Iterate pages directly rather than indexing via range(len(...)).
        for page in reader.pages:
            # extract_text() may return None for pages with no extractable
            # text (e.g. scanned images); treat those as empty instead of
            # crashing on `str += None`.
            pdf_text += page.extract_text() or ""
    return pdf_text
17
+
18
# Step 2: Initialize the tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained("ricepaper/vi-gemma-2b-RAG")
model = AutoModelForCausalLM.from_pretrained(
    "ricepaper/vi-gemma-2b-RAG",
    # device_map="auto" lets accelerate place the weights on the GPU when
    # one is available (falling back to CPU otherwise). The original code
    # additionally called model.to("cuda"), which is redundant here and
    # raises a RuntimeError on dispatched models in recent accelerate
    # versions, so it has been removed.
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
29
+
30
# Step 3: Define the prompt format for the model.
# Placeholders filled by str.format, in order:
#   (1) the context/document text,
#   (2) the user's question,
#   (3) the response slot — passed as "" so the model continues from here.
prompt = """
### Instruction and Input:
Based on the following context/document:
{}
Please answer the question: {}

### Response:
{}
"""
40
+
41
# Step 4: Function to generate an answer based on query and context.
def generate_answer(context, query):
    """Generate an answer to *query* grounded in *context*.

    Parameters
    ----------
    context : str
        Document text the answer should be based on.
    query : str
        The question to answer.

    Returns
    -------
    str
        The full decoded model output (prompt plus generated response).
    """
    input_text = prompt.format(context, query, "")
    # The tokenizer returns a BatchEncoding (input_ids + attention_mask),
    # so name it accordingly rather than `input_ids`.
    inputs = tokenizer(input_text, return_tensors="pt")

    # Move the inputs to wherever the model's weights actually live —
    # device_map="auto" may have placed them on GPU or CPU — instead of
    # unconditionally assuming "cuda" is both present and correct.
    inputs = inputs.to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=500,
        no_repeat_ngram_size=5,  # discourage verbatim 5-gram repetition
    )

    # Decode the whole sequence; skip_special_tokens drops BOS/EOS markers.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
60
+
61
# Step 5: Read the PDF and generate an answer.
def main(pdf_path='/content/monopoly.pdf', query="Your question here"):
    """Run the end-to-end pipeline: extract the PDF text, answer *query*.

    Defaults preserve the original script's placeholder path and question;
    replace them with your own PDF path and question.
    """
    pdf_text = extract_text_from_pdf(pdf_path)
    answer = generate_answer(pdf_text, query)
    print("Answer:", answer)


# Guard so importing this module no longer triggers PDF parsing and a
# full model generation pass as a side effect.
if __name__ == "__main__":
    main()