# Install necessary libraries (Colab/notebook syntax)
!pip install PyPDF2 transformers torch accelerate streamlit

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import PyPDF2

# Step 1: Extract text from the PDF
def extract_text_from_pdf(pdf_path):
    pdf_text = ""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # extract_text() can return None for image-only pages
            pdf_text += page.extract_text() or ""
    return pdf_text

# Step 2: Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("ricepaper/vi-gemma-2b-RAG")
model = AutoModelForCausalLM.from_pretrained(
    "ricepaper/vi-gemma-2b-RAG",
    device_map="auto",          # places the model on the GPU automatically if one is available
    torch_dtype=torch.bfloat16,
)
# Note: with device_map="auto" the model is already dispatched to the available
# device(s); an explicit model.to("cuda") is unnecessary here and can raise an
# error on accelerate-dispatched models.

# Step 3: Define the prompt format for the model
prompt = """
### Instruction and Input:
Based on the following context/document:
{}
Please answer the question: {}

### Response:
{}
"""

# Step 4: Generate an answer to a query grounded in the given context
def generate_answer(context, query):
    input_text = prompt.format(context, query, "")
    # Move the tokenized inputs to the same device as the model
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate text using the model
    outputs = model.generate(
        **inputs,
        max_new_tokens=500,
        no_repeat_ngram_size=5,
    )

    # Decode only the newly generated tokens so the prompt is not echoed back
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return answer

# Step 5: Read the PDF and generate an answer
pdf_path = "/content/monopoly.pdf"  # Replace with your PDF file path
pdf_text = extract_text_from_pdf(pdf_path)

query = "Your question here"  # Replace with your question

# Generate an answer based on the extracted PDF text and the query
answer = generate_answer(pdf_text, query)
print("Answer:", answer)
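
# --- Optional: handling PDFs longer than the model's context window ---
# A full PDF can easily exceed what a 2B Gemma-style model accepts in a single
# prompt. The sketch below is one simple workaround, not part of the model's
# documented usage: it splits the extracted text into overlapping chunks and
# asks the question of each chunk separately. The chunk size, overlap, and the
# helper names (split_into_chunks, answer_long_pdf) are illustrative
# assumptions, not values taken from the model card.

def split_into_chunks(text, chunk_size=3000, overlap=300):
    """Split text into overlapping character chunks (sizes are illustrative)."""
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size - overlap
    return chunks

def answer_long_pdf(pdf_text, query):
    """Ask the same question of each chunk and collect the per-chunk answers."""
    return [generate_answer(chunk, query) for chunk in split_into_chunks(pdf_text)]

# Example usage:
# for i, ans in enumerate(answer_long_pdf(pdf_text, query)):
#     print(f"Chunk {i}: {ans}")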
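
# --- Optional: a minimal Streamlit front end ---
# streamlit is installed above but not used by the script itself. This is a
# minimal sketch of how the two functions could be wired into a web UI; save
# it as a separate file (e.g. a hypothetical app.py, together with the code
# above) and launch it with `streamlit run app.py`. PyPDF2.PdfReader accepts
# file-like objects, so the uploaded file can be read without saving it to disk.

import streamlit as st

st.title("PDF Question Answering")

uploaded_pdf = st.file_uploader("Upload a PDF", type="pdf")
question = st.text_input("Your question")

if uploaded_pdf is not None and question:
    reader = PyPDF2.PdfReader(uploaded_pdf)
    text = "".join(page.extract_text() or "" for page in reader.pages)
    with st.spinner("Generating answer..."):
        st.write(generate_answer(text, question))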