# import chromadb # from chromadb.utils import embedding_functions # from sentence_transformers import SentenceTransformer # from transformers import pipeline # import streamlit as st # import fitz # PyMuPDF for PDF parsing # # Step 1: Setup ChromaDB # def setup_chromadb(): # # Initialize ChromaDB in-memory instance # client = chromadb.Client() # try: # client.delete_collection("pdf_data") # print("Existing collection 'pdf_data' deleted.") # except: # print("Collection 'pdf_data' not found, creating a new one.") # # Create a new collection with the embedding function # ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="sentence-transformers/all-MiniLM-L6-v2") # collection = client.create_collection("pdf_data", embedding_function=ef) # return client, collection # # Step 2: Extract Text from PDF # def extract_text_from_pdf(pdf_path): # pdf_text = "" # with fitz.open(pdf_path) as doc: # for page in doc: # pdf_text += page.get_text() # return pdf_text # # Step 3: Add Extracted Text to Vector Database # def add_pdf_text_to_db(collection, pdf_text): # sentences = pdf_text.split("\n") # Split text into lines for granularity # for idx, sentence in enumerate(sentences): # if sentence.strip(): # Avoid empty lines # collection.add( # ids=[f"pdf_text_{idx}"], # documents=[sentence], # metadatas={"line_number": idx, "text": sentence} # ) # # Step 4: Query Function # def query_pdf_data(collection, query, retriever_model): # results = collection.query( # query_texts=[query], # n_results=3 # ) # context = " ".join([doc for doc in results["documents"][0]]) # answer = retriever_model(f"Context: {context}\nQuestion: {query}") # return answer, results["metadatas"] # # Streamlit Interface # def main(): # st.title("PDF Chatbot with Retrieval-Augmented Generation") # st.write("Upload a PDF, and ask questions about its content!") # # Initialize components # client, collection = setup_chromadb() # retriever_model = pipeline("text2text-generation", model="google/flan-t5-small") # Free LLM # # File upload # uploaded_file = st.file_uploader("Upload your PDF file", type="pdf") # if uploaded_file: # st.write("Extracting text and populating the database...") # pdf_text = extract_text_from_pdf(uploaded_file) # add_pdf_text_to_db(collection, pdf_text) # st.success("PDF text has been added to the database. You can now query it!") # # Query Input # query = st.text_input("Enter your query about the PDF:") # if query: # try: # answer, metadata = query_pdf_data(collection, query, retriever_model) # st.subheader("Answer:") # st.write(answer[0]['generated_text']) # st.subheader("Retrieved Context:") # for meta in metadata[0]: # st.write(meta) # except Exception as e: # st.error(f"An error occurred: {str(e)}") # if __name__ == "__main__": # main() import streamlit as st from streamlit_chromadb_connection.chromadb_connection import ChromadbConnection configuration = { "client": "PersistentClient", "path": "/tmp/.chroma" } collection_name = "documents_collection" conn = st.connection("chromadb", type=ChromaDBConnection, **configuration) documents_collection_df = conn.get_collection_data(collection_name) st.dataframe(documents_collection_df) # import tempfile # import PyPDF2 # import streamlit as st # from transformers import GPT2LMHeadModel, GPT2Tokenizer # # Load pre-trained GPT-3 model and tokenizer # tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") # model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2") # def extract_text_from_pdf(file_path): # text = "" # with open(file_path, "rb") as f: # reader = PyPDF2.PdfFileReader(f) # for page_num in range(reader.numPages): # text += reader.getPage(page_num).extractText() # return text # def generate_response(user_input): # input_ids = tokenizer.encode(user_input, return_tensors="pt") # output = model.generate(input_ids, max_length=100, num_return_sequences=1, temperature=0.7) # response = tokenizer.decode(output[0], skip_special_tokens=True) # return response # def main(): # st.title("PDF Chatbot") # pdf_file = st.file_uploader("Upload an pdf file", type=["pdf"], accept_multiple_files=False) # if pdf_file is not None: # with tempfile.NamedTemporaryFile(delete=False) as tmp_file: # tmp_file.write(pdf_file.read()) # st.success("PDF file successfully uploaded and stored temporally.") # file_path = tmp_file.name # pdf_text = extract_text_from_pdf(file_path) # st.text_area("PDF Content", pdf_text) # else: # st.markdown('File not found!') # user_input = st.text_input("You:", "") # if st.button("Send"): # response = generate_response(user_input) # st.text_area("Chatbot:", response) # if __name__ == "__main__": # main()