"""Gradio chatbot that answers recruitment questions from local documents.

Files under ``multiple_docs`` are loaded, split into chunks, embedded with a
sentence-transformers model and stored in a Chroma vector store. A
``ConversationalRetrievalChain`` backed by a local Hugging Face
text-generation pipeline answers questions over the retrieved chunks, and a
Gradio ``Blocks`` UI exposes the chain as a chat.

This replaces an earlier OpenAI-based variant (``OpenAIEmbeddings`` +
``ChatOpenAI``) with locally executed Hugging Face models.
"""

import os
import sys

# Swap the stdlib sqlite3 module for pysqlite3 *before* any third-party import
# can pull in sqlite3, so every later ``import sqlite3`` sees the replacement.
# NOTE(review): presumably needed because chromadb requires a newer SQLite
# than the host provides -- confirm against the deployment environment.
__import__('pysqlite3')
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

import gradio as gr  # noqa: E402
from langchain.chains import ConversationalRetrievalChain  # noqa: E402
from langchain.document_loaders import (  # noqa: E402,F401
    DirectoryLoader,
    Docx2txtLoader,
    PyPDFLoader,
    TextLoader,
)
from langchain.embeddings import HuggingFaceEmbeddings  # noqa: E402
from langchain.llms import HuggingFacePipeline  # noqa: E402
from langchain.text_splitter import CharacterTextSplitter  # noqa: E402
from langchain.vectorstores import Chroma  # noqa: E402
from sentence_transformers import SentenceTransformer  # noqa: E402,F401
from transformers import pipeline  # noqa: E402

DOCS_DIR = "./multiple_docs"
PERSIST_DIR = "./data"
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
GENERATION_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"
# Budget for *generated* tokens only. The prompt already carries up to six
# retrieved chunks, so a total-length cap (``max_length``) would be consumed
# by the prompt itself.
# NOTE(review): 256 is a judgement call -- tune against the model's context
# window and the desired answer length.
MAX_NEW_TOKENS = 256

# File-name suffix -> loader class. Checked in insertion order.
# NOTE(review): Docx2txtLoader is also used for legacy ``.doc`` files, as in
# the original code; verify it can actually parse that format.
LOADERS_BY_SUFFIX = {
    ".pdf": PyPDFLoader,
    ".docx": Docx2txtLoader,
    ".doc": Docx2txtLoader,
    ".txt": TextLoader,
}

GREETING = (
    "Hello, I'm Thierry Decae's chatbot, you can ask me any recruitment "
    "related questions such as my previous or most recent experience, where "
    "I'm eligible to work, when I can start work, what NLP skills I have, "
    "and much more! you can chat with me directly in multiple languages"
)


def load_documents(directory):
    """Load every supported file in *directory* into LangChain documents.

    Files whose name does not end with a suffix in ``LOADERS_BY_SUFFIX`` are
    skipped. Files are visited in sorted order so the result is deterministic.
    """
    loaded = []
    for name in sorted(os.listdir(directory)):
        loader_cls = next(
            (cls for suffix, cls in LOADERS_BY_SUFFIX.items()
             if name.endswith(suffix)),
            None,
        )
        if loader_cls is None:
            continue
        loaded.extend(loader_cls(os.path.join(directory, name)).load())
    return loaded


# --- Build the vector store -------------------------------------------------
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
docs = splitter.split_documents(load_documents(DOCS_DIR))

# Hand the embedding model to Chroma so the same model embeds both the stored
# chunks and, later, each incoming query.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
vectorstore = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=embedding_model,
)
if docs:
    # One batched insert; skipped for an empty corpus so the app still starts.
    vectorstore.add_documents(docs)
vectorstore.persist()

# --- Build the conversational retrieval chain -------------------------------
generator = pipeline(
    "text-generation",
    model=GENERATION_MODEL_NAME,
    max_new_tokens=MAX_NEW_TOKENS,
)
# HuggingFacePipeline adapts the transformers pipeline to LangChain's LLM
# interface, which ConversationalRetrievalChain.from_llm expects.
llm = HuggingFacePipeline(pipeline=generator)

chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=vectorstore.as_retriever(search_kwargs={'k': 6}),
    return_source_documents=True,
    verbose=False,
)

# --- Gradio UI --------------------------------------------------------------
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(
        [("", GREETING)],
        avatar_images=[
            "./multiple_docs/Guest.jpg",
            "./multiple_docs/Thierry Picture.jpg",
        ],
    )
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(query, chat_history):
        """Answer *query* with the retrieval chain and extend the chat.

        Args:
            query: Text the user submitted in the textbox.
            chat_history: Current chatbot value, a sequence of
                (user message, bot message) pairs.

        Returns:
            A Gradio update that clears the textbox, and the chat history
            with the new (query, answer) pair appended.
        """
        # The chain is given the history as a list of 2-tuples.
        history_pairs = [(turn[0], turn[1]) for turn in chat_history]
        result = chain({"question": query, "chat_history": history_pairs})
        chat_history.append((query, result["answer"]))
        return gr.update(value=""), chat_history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False)
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch(debug=True)