import streamlit as st
from transformers import BartForConditionalGeneration, BartTokenizer
from sentence_transformers import SentenceTransformer
import pdfplumber
import faiss

# Dimension of the embeddings produced by paraphrase-MiniLM-L6-v2.
# The FAISS index and every vector added to it must share this dimension,
# so the query and the documents must be encoded by the same model.
EMBEDDING_DIM = 384


@st.cache_resource
def load_models():
    """Load the generator and embedding models once. Streamlit re-runs the
    whole script on every interaction, so caching avoids reloading them."""
    generator = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
    generator_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    sentence_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
    return generator, generator_tokenizer, sentence_model


# Streamlit Frontend (set_page_config must be the first Streamlit call)
st.set_page_config(page_title="RAG-based PDF Query Application", layout="wide")
st.title("📄 Retrieval-Augmented Generation (RAG) Application")

generator, generator_tokenizer, sentence_model = load_models()

# Keep the document store and FAISS index in session state so they persist
# across Streamlit's script re-runs instead of being rebuilt every time.
if "documents" not in st.session_state:
    # Initialize the documents list with some sample documents
    st.session_state.documents = [
        "Streamlit is an open-source Python library that makes it easy to build beautiful custom web apps for machine learning and data science.",
        "Hugging Face is a company that provides tools and models for natural language processing (NLP).",
        "Retrieval-Augmented Generation (RAG) is a method that combines document retrieval with a generative model for question answering.",
    ]
    # Inner-product search over L2-normalized vectors equals cosine similarity.
    st.session_state.index = faiss.IndexFlatIP(EMBEDDING_DIM)
    seed_embeddings = sentence_model.encode(st.session_state.documents, normalize_embeddings=True)
    st.session_state.index.add(seed_embeddings)
    st.session_state.indexed_files = set()

documents = st.session_state.documents
index = st.session_state.index

# File Upload for PDF Documents
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file and uploaded_file.name not in st.session_state.indexed_files:
    # Extract text from the PDF, page by page
    pdf_text = ""
    with pdfplumber.open(uploaded_file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # extract_text() returns None for image-only pages
                pdf_text += page_text + "\n"

    if pdf_text:
        # Add the PDF text to the document store and the FAISS index
        documents.append(pdf_text)
        pdf_embedding = sentence_model.encode([pdf_text], normalize_embeddings=True)
        index.add(pdf_embedding)
        # Remember the file so a re-run does not index it a second time
        st.session_state.indexed_files.add(uploaded_file.name)
        st.success("PDF text added to the knowledge base for querying!")
    else:
        st.error("No text could be extracted from the PDF.")

# User Input
st.markdown("Enter your query below:")
query = st.text_input("🔍 Enter your query")

if st.button("💬 Get Answer"):
    if query:
        # Step 1: Encode the query with the same model used for the documents,
        # so query and document vectors live in the same 384-dim space
        query_embedding = sentence_model.encode([query], normalize_embeddings=True)

        # Step 2: Perform FAISS search for the top-k most similar documents
        k = min(3, len(documents))
        _, retrieved_doc_indices = index.search(query_embedding, k)

        # Step 3: Look up the retrieved documents (FAISS pads missing hits with -1)
        retrieved_docs = [documents[idx] for idx in retrieved_doc_indices[0] if idx != -1]

        # Step 4: Concatenate the retrieved documents into a single context
        context = " ".join(retrieved_docs)

        # Step 5: Use the generator to answer the question, truncating the
        # prompt to BART's 1024-token input limit so long PDFs don't error out
        input_ids = generator_tokenizer.encode(
            f"question: {query} context: {context}",
            return_tensors="pt",
            truncation=True,
            max_length=1024,
        )
        outputs = generator.generate(input_ids, max_length=200, num_return_sequences=1)

        # Decode and display the response
        answer = generator_tokenizer.decode(outputs[0], skip_special_tokens=True)
        st.write("**Answer:**")
        st.write(answer)
    else:
        st.error("Please enter a query.")
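
# ---------------------------------------------------------------------------
# Usage note: save this script under a name of your choosing (app.py is used
# here as an assumed example) and launch it with:
#
#   streamlit run app.py
#
# Required packages (pip names, assuming the CPU build of FAISS):
#   streamlit transformers sentence-transformers pdfplumber faiss-cpu torch
#
# Optional sketch, not wired into the app above: each uploaded PDF is indexed
# as a single vector, which dilutes retrieval quality for long documents. A
# common refinement is to split the text into overlapping chunks and index
# each chunk separately. `chunk_text` below is a hypothetical helper that
# illustrates the idea; the window sizes are illustrative assumptions.
# ---------------------------------------------------------------------------
def chunk_text(text, chunk_size=500, overlap=100):
    """Split text into overlapping character windows for finer-grained retrieval."""
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size - overlap  # step forward, keeping `overlap` chars of context
    return chunks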