trgardos commited on
Commit
9d92b71
2 Parent(s): b5be549 a24065c

Merge pull request #1 from DL4DS/initial_commit

Browse files
.gitignore CHANGED
@@ -158,3 +158,6 @@ cython_debug/
158
  # and can be added to the global gitignore or merged into this file. For a more nuclear
159
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
  #.idea/
 
 
 
 
158
  # and can be added to the global gitignore or merged into this file. For a more nuclear
159
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
  #.idea/
161
+
162
+ # log files
163
+ *.log
README.md CHANGED
@@ -1 +1,13 @@
1
- # dl4ds_tutor
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dl4ds_tutor
2
+
3
+ ## Setup
4
+ 1. conda create -n dl4ds_tutor python=3.9
5
+ 2. conda activate dl4ds_tutor
6
+ 3. pip install -r requirements.txt
7
+ 4. Create a .env file and add your openai api key as 'OPENAI_API_KEY=XXX'
8
+
9
+ ## Instructions
10
+ 1. Add files to `data/`
11
+ 2. cd code
12
+ 3. chainlit run main.py
13
+
code/.chainlit/config.toml ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ # Whether to enable telemetry (default: true). No personal data is collected.
3
+ enable_telemetry = true
4
+
5
+ # List of environment variables to be provided by each user to use the app.
6
+ user_env = []
7
+
8
+ # Duration (in seconds) during which the session is saved when the connection is lost
9
+ session_timeout = 3600
10
+
11
+ # Enable third parties caching (e.g LangChain cache)
12
+ cache = false
13
+
14
+ # Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
15
+ # follow_symlink = false
16
+
17
+ [features]
18
+ # Show the prompt playground
19
+ prompt_playground = true
20
+
21
+ # Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
22
+ unsafe_allow_html = false
23
+
24
+ # Process and display mathematical expressions. This can clash with "$" characters in messages.
25
+ latex = false
26
+
27
+ # Authorize users to upload files with messages
28
+ multi_modal = true
29
+
30
+ # Allows user to use speech to text
31
+ [features.speech_to_text]
32
+ enabled = false
33
+ # See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
34
+ # language = "en-US"
35
+
36
+ [UI]
37
+ # Name of the app and chatbot.
38
+ name = "LLM Tutor"
39
+
40
+ # Show the readme while the conversation is empty.
41
+ show_readme_as_default = true
42
+
43
+ # Description of the app and chatbot. This is used for HTML tags.
44
+ # description = ""
45
+
46
+ # Large size content are by default collapsed for a cleaner ui
47
+ default_collapse_content = true
48
+
49
+ # The default value for the expand messages settings.
50
+ default_expand_messages = false
51
+
52
+ # Hide the chain of thought details from the user in the UI.
53
+ hide_cot = false
54
+
55
+ # Link to your github repo. This will add a github button in the UI's header.
56
+ # github = "https://github.com/DL4DS/dl4ds_tutor"
57
+
58
+ # Specify a CSS file that can be used to customize the user interface.
59
+ # The CSS file can be served from the public directory or via an external link.
60
+ # custom_css = "/public/test.css"
61
+
62
+ # Override default MUI light theme. (Check theme.ts)
63
+ [UI.theme.light]
64
+ #background = "#FAFAFA"
65
+ #paper = "#FFFFFF"
66
+
67
+ [UI.theme.light.primary]
68
+ #main = "#F80061"
69
+ #dark = "#980039"
70
+ #light = "#FFE7EB"
71
+
72
+ # Override default MUI dark theme. (Check theme.ts)
73
+ [UI.theme.dark]
74
+ #background = "#FAFAFA"
75
+ #paper = "#FFFFFF"
76
+
77
+ [UI.theme.dark.primary]
78
+ #main = "#F80061"
79
+ #dark = "#980039"
80
+ #light = "#FFE7EB"
81
+
82
+
83
+ [meta]
84
+ generated_by = "0.7.700"
code/chainlit.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Welcome to DL4DS Tutor! 🚀🤖
2
+
3
+ Hi there, this is an LLM chatbot designed to help answer questions on the course content, built using Langchain and Chainlit.
4
+ This is still very much a Work in Progress.
5
+
6
+ ## Useful Links 🔗
7
+
8
+ - **Documentation:** [Chainlit Documentation](https://docs.chainlit.io) 📚
code/config.yml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ embedding_options:
2
+ embedd_files: True # bool
3
+ persist_directory: null # str or None
4
+ data_path: '../data' # str
5
+ db_option : 'FAISS' # str
6
+ db_path : 'vectorstores' # str
7
+ model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
8
+ search_top_k : 5 # int
9
+ llm_params:
10
+ use_history: True # bool
11
+ llm_loader: 'openai' # str [ctransformers, openai]
12
+ openai_params:
13
+ model: 'gpt-4' # str [gpt-3.5-turbo-1106, gpt-4]
14
+ ctransformers_params:
15
+ model: "TheBloke/Llama-2-7B-Chat-GGML"
16
+ model_type: "llama"
17
+ splitter_options:
18
+ use_splitter: True # bool
19
+ split_by_token : True # bool
20
+ remove_leftover_delimiters: True # bool
21
+ remove_chunks: False # bool
22
+ chunk_size : 800 # int
23
+ chunk_overlap : 80 # int
24
+ chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
25
+ front_chunks_to_remove : null # int or None
26
+ last_chunks_to_remove : null # int or None
27
+ delimiters_to_remove : ['\t', '\n', ' ', ' '] # list of strings
code/main.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import CTransformers
import chainlit as cl
from langchain_community.chat_models import ChatOpenAI
from langchain_community.embeddings import OpenAIEmbeddings
import yaml
import logging
from dotenv import load_dotenv

from modules.llm_tutor import LLMTutor


# Module-level logger: INFO+ records go to both the console and a log file.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Console Handler
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

# File Handler
log_file_path = "log_file.log"  # Change this to your desired log file path
file_handler = logging.FileHandler(log_file_path)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Load config.yml once at import time.  The LLMTutor instance created here is
# shared by every chat session served by this process; note that constructing
# it may (re)build the vector store when embedd_files is enabled.
with open("config.yml", "r") as f:
    config = yaml.safe_load(f)
print(config)
logger.info("Config file loaded")
logger.info(f"Config: {config}")
logger.info("Creating llm_tutor instance")
llm_tutor = LLMTutor(config, logger=logger)
41
+
42
+
43
+ # chainlit code
44
@cl.on_chat_start
async def start():
    """Initialise a chat session: build the QA chain and greet the user."""
    qa_chain = llm_tutor.qa_bot()

    # Show a placeholder while the chain spins up, then swap in the greeting.
    greeting = cl.Message(content="Starting the bot...")
    await greeting.send()
    greeting.content = "Hey, What Can I Help You With?"
    await greeting.update()

    # Stash the chain so the on_message handler can retrieve it per session.
    cl.user_session.set("chain", qa_chain)
53
+
54
+
55
@cl.on_message
async def main(message):
    """Answer a user message with the session's QA chain.

    Streams the final answer through a Langchain callback handler and appends
    the source PDFs (grouped by document, with page numbers collected) that
    retrieval used.
    """
    chain = cl.user_session.get("chain")
    cb = cl.AsyncLangchainCallbackHandler(
        stream_final_answer=True, answer_prefix_tokens=["FINAL", "ANSWER"]
    )
    cb.answer_reached = True
    res = await chain.acall(message.content, callbacks=[cb])
    # ConversationalRetrievalChain returns "answer"; RetrievalQA returns
    # "result" -- handle whichever chain type the config selected.
    # (Previously a bare `except:` which would also mask unrelated errors.)
    try:
        answer = res["answer"]
    except KeyError:
        answer = res["result"]
    print(f"answer: {answer}")

    # Group retrieved chunks by source document, accumulating page numbers.
    source_elements_dict = {}
    source_elements = []
    found_sources = []

    for source in res["source_documents"]:
        title = source.metadata["source"]

        if title not in source_elements_dict:
            source_elements_dict[title] = {
                "page_number": [source.metadata["page"]],
                "url": source.metadata["source"],
                "content": source.page_content,
            }
        else:
            source_elements_dict[title]["page_number"].append(source.metadata["page"])
            source_elements_dict[title][
                "content_" + str(source.metadata["page"])
            ] = source.page_content
        # sort the page numbers
        # source_elements_dict[title]["page_number"].sort()

    for title, source in source_elements_dict.items():
        # Attach each source document as a PDF element named "File".
        source_elements.append(cl.Pdf(name="File", path=title))
        found_sources.append("File")
        # for pn in source["page_number"]:
        #     source_elements.append(
        #         cl.Text(name=str(pn), content=source["content_"+str(pn)])
        #     )
        #     found_sources.append(str(pn))

    if found_sources:
        answer += f"\nSource:{', '.join(found_sources)}"
    else:
        answer += "\nNo source found."

    await cl.Message(content=answer, elements=source_elements).send()
code/modules/__init__.py ADDED
File without changes
code/modules/chat_model_loader.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.chat_models import ChatOpenAI
2
+ from langchain.llms import CTransformers
3
+
4
+
5
class ChatModelLoader:
    """Instantiate the chat LLM selected by ``config['llm_params']['llm_loader']``.

    Supported loaders: ``openai`` (ChatOpenAI) and ``ctransformers``
    (a local GGML model via CTransformers).
    """

    def __init__(self, config):
        # config: parsed config.yml dictionary.
        self.config = config

    def load_chat_model(self):
        """Return an LLM instance for the configured loader.

        Raises:
            ValueError: if ``llm_loader`` is not a recognised option.
        """
        loader = self.config["llm_params"]["llm_loader"]
        if loader == "openai":
            llm = ChatOpenAI(
                model_name=self.config["llm_params"]["openai_params"]["model"]
            )
        # Bug fix: config.yml documents the value as 'ctransformers', but the
        # original code only matched 'Ctransformers' (capital C), so the
        # documented value fell through to ValueError.  Accept any casing.
        elif loader.lower() == "ctransformers":
            llm = CTransformers(
                model=self.config["llm_params"]["ctransformers_params"]["model"],
                model_type=self.config["llm_params"]["ctransformers_params"][
                    "model_type"
                ],
                max_new_tokens=512,
                temperature=0.5,
            )
        else:
            raise ValueError("Invalid LLM Loader")
        return llm
code/modules/constants.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dotenv import load_dotenv
import os

load_dotenv()

# API Keys - Loaded from the .env file

# May be None if the .env file is missing or does not define the key.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


# Prompt Templates

# QA prompt used when chat history is disabled; expects {context} and {question}.
prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

# QA prompt used when chat history is enabled; additionally expects {chat_history}.
prompt_template_with_history = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use the history to answer the question if you can.
Chat History:
{chat_history}
Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""
code/modules/data_loader.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pysrt
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain.document_loaders import (
5
+ PyMuPDFLoader,
6
+ Docx2txtLoader,
7
+ YoutubeLoader,
8
+ WebBaseLoader,
9
+ TextLoader,
10
+ )
11
+ from langchain.schema import Document
12
+ from tempfile import NamedTemporaryFile
13
+ import logging
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class DataLoader:
    """Extract text from uploaded files and weblinks and split it into chunks."""

    def __init__(self, config):
        """
        Class for handling all data extraction and chunking
        Inputs:
            config - dictionary from yaml file, containing all important parameters
        """
        self.config = config
        self.remove_leftover_delimiters = config["splitter_options"][
            "remove_leftover_delimiters"
        ]

        # Main list of all documents
        self.document_chunks_full = []
        self.document_names = []

        if config["splitter_options"]["use_splitter"]:
            if config["splitter_options"]["split_by_token"]:
                # Token-based splitting keeps chunks within an LLM token budget.
                self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                    chunk_size=config["splitter_options"]["chunk_size"],
                    chunk_overlap=config["splitter_options"]["chunk_overlap"],
                    separators=config["splitter_options"]["chunk_separators"],
                )
            else:
                self.splitter = RecursiveCharacterTextSplitter(
                    chunk_size=config["splitter_options"]["chunk_size"],
                    chunk_overlap=config["splitter_options"]["chunk_overlap"],
                    separators=config["splitter_options"]["chunk_separators"],
                )
        else:
            self.splitter = None
        logger.info("InfoLoader instance created")

    def get_chunks(self, uploaded_files, weblinks):
        """Return (document_chunks, document_names) for the given files and links."""
        # Main list of all documents
        self.document_chunks_full = []
        self.document_names = []

        def remove_delimiters(document_chunks: list):
            """
            Helper function to remove remaining delimiters in document chunks
            """
            for chunk in document_chunks:
                for delimiter in self.config["splitter_options"][
                    "delimiters_to_remove"
                ]:
                    chunk.page_content = re.sub(delimiter, " ", chunk.page_content)
            return document_chunks

        def remove_chunks(document_chunks: list):
            """
            Helper function to remove any unwanted document chunks after splitting
            """
            # Bug fix: config.yml defines 'front_chunks_to_remove' (plural);
            # the previous key 'front_chunk_to_remove' raised KeyError.
            # `or 0` also tolerates the documented null default.
            front = self.config["splitter_options"]["front_chunks_to_remove"] or 0
            end = self.config["splitter_options"]["last_chunks_to_remove"] or 0
            # Remove pages
            for _ in range(front):
                del document_chunks[0]
            for _ in range(end):
                document_chunks.pop()
            logger.info(f"\tNumber of pages after skipping: {len(document_chunks)}")
            return document_chunks

        def get_pdf(temp_file_path: str, title: str):
            """
            Function to process PDF files
            """
            loader = PyMuPDFLoader(
                temp_file_path
            )  # This loader preserves more metadata

            if self.splitter:
                document_chunks = self.splitter.split_documents(loader.load())
            else:
                document_chunks = loader.load()

            # Prefer the embedded PDF title over the filename when present.
            if "title" in document_chunks[0].metadata.keys():
                title = document_chunks[0].metadata["title"]

            logger.info(
                f"\t\tOriginal no. of pages: {document_chunks[0].metadata['total_pages']}"
            )

            return title, document_chunks

        def get_txt(temp_file_path: str, title: str):
            """
            Function to process TXT files
            """
            loader = TextLoader(temp_file_path, autodetect_encoding=True)

            if self.splitter:
                document_chunks = self.splitter.split_documents(loader.load())
            else:
                document_chunks = loader.load()

            # Update the metadata
            for chunk in document_chunks:
                chunk.metadata["source"] = title
                chunk.metadata["page"] = "N/A"

            return title, document_chunks

        def get_srt(temp_file_path: str, title: str):
            """
            Function to process SRT files
            """
            subs = pysrt.open(temp_file_path)

            text = ""
            for sub in subs:
                text += sub.text
            document_chunks = [Document(page_content=text)]

            if self.splitter:
                document_chunks = self.splitter.split_documents(document_chunks)

            # Update the metadata
            for chunk in document_chunks:
                chunk.metadata["source"] = title
                chunk.metadata["page"] = "N/A"

            return title, document_chunks

        def get_docx(temp_file_path: str, title: str):
            """
            Function to process DOCX files
            """
            loader = Docx2txtLoader(temp_file_path)

            if self.splitter:
                document_chunks = self.splitter.split_documents(loader.load())
            else:
                document_chunks = loader.load()

            # Update the metadata
            for chunk in document_chunks:
                chunk.metadata["source"] = title
                chunk.metadata["page"] = "N/A"

            return title, document_chunks

        def get_youtube_transcript(url: str):
            """
            Function to retrieve youtube transcript and process text
            """
            loader = YoutubeLoader.from_youtube_url(
                url, add_video_info=True, language=["en"], translation="en"
            )

            if self.splitter:
                document_chunks = self.splitter.split_documents(loader.load())
            else:
                document_chunks = loader.load_and_split()

            # Replace the source with title (for display in st UI later)
            for chunk in document_chunks:
                chunk.metadata["source"] = chunk.metadata["title"]
                logger.info(chunk.metadata["title"])

            # Bug fix: 'title' was never assigned in this scope; it leaked in
            # from the enclosing function and raised NameError when no files
            # had been processed first.
            title = document_chunks[0].metadata["title"] if document_chunks else url

            return title, document_chunks

        def get_html(url: str):
            """
            Function to process websites via HTML files
            """
            loader = WebBaseLoader(url)

            if self.splitter:
                document_chunks = self.splitter.split_documents(loader.load())
            else:
                document_chunks = loader.load_and_split()

            title = document_chunks[0].metadata["title"]
            logger.info(document_chunks[0].metadata)

            return title, document_chunks

        # Handle file by file
        for file_index, file_path in enumerate(uploaded_files):

            file_name = file_path.split("/")[-1]
            file_type = file_name.split(".")[-1]

            # Handle different file types
            if file_type == "pdf":
                title, document_chunks = get_pdf(file_path, file_name)
            elif file_type == "txt":
                title, document_chunks = get_txt(file_path, file_name)
            elif file_type == "docx":
                title, document_chunks = get_docx(file_path, file_name)
            elif file_type == "srt":
                title, document_chunks = get_srt(file_path, file_name)
            else:
                # Bug fix: unsupported types previously fell through and either
                # raised NameError or silently re-used the previous file's chunks.
                logger.warning(f"\t\tUnsupported file type, skipping: {file_name}")
                continue

            # Additional wrangling - Remove leftover delimiters and any specified chunks
            if self.remove_leftover_delimiters:
                document_chunks = remove_delimiters(document_chunks)
            if self.config["splitter_options"]["remove_chunks"]:
                document_chunks = remove_chunks(document_chunks)

            logger.info(f"\t\tExtracted no. of chunks: {len(document_chunks)}")
            self.document_names.append(title)
            self.document_chunks_full.extend(document_chunks)

        # Handle youtube links:
        # (guard against an empty list before peeking at weblinks[0])
        if weblinks and weblinks[0] != "":
            logger.info(f"Splitting weblinks: total of {len(weblinks)}")

            # Handle link by link
            for link_index, link in enumerate(weblinks):
                logger.info(f"\tSplitting link {link_index+1} : {link}")
                if "youtube" in link:
                    title, document_chunks = get_youtube_transcript(link)
                else:
                    title, document_chunks = get_html(link)

                # Additional wrangling - Remove leftover delimiters and any specified chunks
                if self.remove_leftover_delimiters:
                    document_chunks = remove_delimiters(document_chunks)
                if self.config["splitter_options"]["remove_chunks"]:
                    document_chunks = remove_chunks(document_chunks)

                print(f"\t\tExtracted no. of chunks: {len(document_chunks)}")
                self.document_names.append(title)
                self.document_chunks_full.extend(document_chunks)

        logger.info(
            f"\tNumber of document chunks extracted in total: {len(self.document_chunks_full)}\n\n"
        )

        return self.document_chunks_full, self.document_names
code/modules/embedding_model_loader.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.embeddings import OpenAIEmbeddings
2
+ from langchain.embeddings import HuggingFaceEmbeddings
3
+ from modules.constants import *
4
+
5
+
6
class EmbeddingModelLoader:
    """Build the embedding model named in ``config['embedding_options']['model']``."""

    def __init__(self, config):
        # config: parsed config.yml dictionary.
        self.config = config

    def load_embedding_model(self):
        """Return an embeddings instance for the configured model.

        OpenAI models are served through ``OpenAIEmbeddings``; any other model
        name is treated as a HuggingFace sentence-transformers checkpoint.
        """
        model_name = self.config["embedding_options"]["model"]
        if model_name in ["text-embedding-ada-002"]:
            embedding_model = OpenAIEmbeddings(
                deployment="SL-document_embedder",
                model=model_name,
                show_progress_bar=True,
                openai_api_key=OPENAI_API_KEY,
            )
        else:
            # Generalization: honour the configured model name instead of
            # hard-coding all-MiniLM-L6-v2 (identical behaviour for the
            # shipped default config, which sets exactly that model).
            embedding_model = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs={"device": "cpu"},
            )
        return embedding_model
code/modules/llm_tutor.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain import PromptTemplate
2
+ from langchain.embeddings import HuggingFaceEmbeddings
3
+ from langchain_community.chat_models import ChatOpenAI
4
+ from langchain_community.embeddings import OpenAIEmbeddings
5
+ from langchain.vectorstores import FAISS
6
+ from langchain.chains import RetrievalQA, ConversationalRetrievalChain
7
+ from langchain.llms import CTransformers
8
+ from langchain.memory import ConversationBufferMemory
9
+ from langchain.chains.conversational_retrieval.prompts import QA_PROMPT
10
+ import os
11
+
12
+ from modules.constants import *
13
+ from modules.chat_model_loader import ChatModelLoader
14
+ from modules.vector_db import VectorDB
15
+
16
+
17
class LLMTutor:
    """Wire the vector DB, prompt templates, and chat LLM into a retrieval QA chain."""

    def __init__(self, config, logger=None):
        self.config = config
        self.vector_db = VectorDB(config, logger=logger)
        # Optionally (re)build and persist the vector store on startup.
        if self.config["embedding_options"]["embedd_files"]:
            self.vector_db.create_database()
            self.vector_db.save_database()

    def set_custom_prompt(self):
        """
        Prompt template for QA retrieval for each vectorstore
        """
        if self.config["llm_params"]["use_history"]:
            custom_prompt_template = prompt_template_with_history
            input_variables = ["context", "chat_history", "question"]
        else:
            custom_prompt_template = prompt_template
            # Bug fix: the history-less template contains no {chat_history}
            # placeholder, so declaring it as an input variable makes
            # PromptTemplate validation fail when use_history is False.
            input_variables = ["context", "question"]
        prompt = PromptTemplate(
            template=custom_prompt_template,
            input_variables=input_variables,
        )
        # prompt = QA_PROMPT

        return prompt

    # Retrieval QA Chain
    def retrieval_qa_chain(self, llm, prompt, db):
        """Build the retrieval chain; conversational iff use_history is set."""
        search_kwargs = {"k": self.config["embedding_options"]["search_top_k"]}
        if self.config["llm_params"]["use_history"]:
            memory = ConversationBufferMemory(
                memory_key="chat_history", return_messages=True, output_key="answer"
            )
            qa_chain = ConversationalRetrievalChain.from_llm(
                llm=llm,
                chain_type="stuff",
                retriever=db.as_retriever(search_kwargs=search_kwargs),
                return_source_documents=True,
                memory=memory,
                combine_docs_chain_kwargs={"prompt": prompt},
            )
        else:
            qa_chain = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type="stuff",
                retriever=db.as_retriever(search_kwargs=search_kwargs),
                return_source_documents=True,
                chain_type_kwargs={"prompt": prompt},
            )
        return qa_chain

    # Loading the model
    def load_llm(self):
        """Instantiate the configured chat LLM."""
        chat_model_loader = ChatModelLoader(self.config)
        llm = chat_model_loader.load_chat_model()
        return llm

    # QA Model Function
    def qa_bot(self):
        """Load the vector DB and LLM, then return the assembled QA chain."""
        db = self.vector_db.load_database()
        self.llm = self.load_llm()
        qa_prompt = self.set_custom_prompt()
        qa = self.retrieval_qa_chain(self.llm, qa_prompt, db)

        return qa

    # output function
    def final_result(self, query):
        """Run a single query end to end and return the chain's response.

        Bug fix: previously defined without ``self`` and calling the bare name
        ``qa_bot()``, which raised NameError whenever it was invoked.
        """
        qa_result = self.qa_bot()
        response = qa_result({"query": query})
        return response
code/modules/vector_db.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import yaml
4
+
5
+ from modules.embedding_model_loader import EmbeddingModelLoader
6
+ from langchain.vectorstores import FAISS
7
+ from modules.data_loader import DataLoader
8
+ from modules.constants import *
9
+
10
+
11
class VectorDB:
    """Create, persist, and load a FAISS vector store over the course data."""

    def __init__(self, config, logger=None):
        # config: parsed config.yml; only the 'embedding_options' section is read here.
        self.config = config
        self.db_option = config["embedding_options"]["db_option"]
        self.document_names = None

        # Set up logging to both console and a file
        # (only when no shared logger is passed in by the caller).
        if logger is None:
            self.logger = logging.getLogger(__name__)
            self.logger.setLevel(logging.INFO)

            # Console Handler
            console_handler = logging.StreamHandler()
            console_handler.setLevel(logging.INFO)
            formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
            console_handler.setFormatter(formatter)
            self.logger.addHandler(console_handler)

            # File Handler
            log_file_path = "vector_db.log"  # Change this to your desired log file path
            file_handler = logging.FileHandler(log_file_path, mode="w")
            file_handler.setLevel(logging.INFO)
            file_handler.setFormatter(formatter)
            self.logger.addHandler(file_handler)
        else:
            self.logger = logger

        self.logger.info("VectorDB instance instantiated")

    def load_files(self):
        """Return the paths of every entry under the configured data_path."""
        files = os.listdir(self.config["embedding_options"]["data_path"])
        files = [
            os.path.join(self.config["embedding_options"]["data_path"], file)
            for file in files
        ]
        return files

    def create_embedding_model(self):
        """Instantiate the embedding model named in the config."""
        self.logger.info("Creating embedding function")
        self.embedding_model_loader = EmbeddingModelLoader(self.config)
        self.embedding_model = self.embedding_model_loader.load_embedding_model()

    def initialize_database(self, document_chunks: list, document_names: list):
        """Build the vector store from pre-split document chunks.

        NOTE(review): only 'FAISS' is handled; any other db_option leaves
        self.vector_db unset and save_database() would fail — confirm intended.
        """
        # Track token usage
        self.logger.info("Initializing vector_db")
        self.logger.info("\tUsing {} as db_option".format(self.db_option))
        if self.db_option == "FAISS":
            self.vector_db = FAISS.from_documents(
                documents=document_chunks, embedding=self.embedding_model
            )
        self.logger.info("Completed initializing vector_db")

    def create_database(self):
        """Load files from data_path, chunk them, and build the vector store."""
        data_loader = DataLoader(self.config)
        self.logger.info("Loading data")
        files = self.load_files()
        # [""] means no weblinks; DataLoader skips its weblink branch for this.
        document_chunks, document_names = data_loader.get_chunks(files, [""])
        self.logger.info("Completed loading data")

        self.create_embedding_model()
        self.initialize_database(document_chunks, document_names)

    def save_database(self):
        """Persist the FAISS index under db_path/db_<db_option>_<model>."""
        self.vector_db.save_local(
            os.path.join(
                self.config["embedding_options"]["db_path"],
                "db_"
                + self.config["embedding_options"]["db_option"]
                + "_"
                + self.config["embedding_options"]["model"],
            )
        )
        self.logger.info("Saved database")

    def load_database(self):
        """Load the persisted FAISS index from disk and return it."""
        self.create_embedding_model()
        self.vector_db = FAISS.load_local(
            os.path.join(
                self.config["embedding_options"]["db_path"],
                "db_"
                + self.config["embedding_options"]["db_option"]
                + "_"
                + self.config["embedding_options"]["model"],
            ),
            self.embedding_model,
        )
        self.logger.info("Loaded database")
        return self.vector_db
99
+
100
+
101
if __name__ == "__main__":
    # Manual entry point: build and persist the vector store from config.yml
    # without starting the chat app.
    with open("config.yml", "r") as f:
        config = yaml.safe_load(f)
    print(config)
    vector_db = VectorDB(config)
    vector_db.create_database()
    vector_db.save_database()
code/vectorstores/db_FAISS_sentence-transformers/all-MiniLM-L6-v2/index.faiss ADDED
Binary file (6.19 kB). View file
 
code/vectorstores/db_FAISS_sentence-transformers/all-MiniLM-L6-v2/index.pkl ADDED
Binary file (9.21 kB). View file
 
code/vectorstores/db_FAISS_text-embedding-ada-002/index.faiss ADDED
Binary file (24.6 kB). View file
 
code/vectorstores/db_FAISS_text-embedding-ada-002/index.pkl ADDED
Binary file (9.21 kB). View file
 
data/webpage.pdf ADDED
Binary file (51.3 kB). View file
 
docs/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Documentation
2
+
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==1.29.0
2
+ PyYAML==6.0.1
3
+ pysrt==1.1.2
4
+ langchain==0.0.353
5
+ tiktoken==0.5.2
6
+ streamlit-chat==0.1.1
7
+ pypdf==3.17.4
8
+ sentence-transformers==2.2.2
9
+ faiss-cpu==1.7.4
10
+ ctransformers==0.2.27
11
+ python-dotenv==1.0.0
12
+ openai==1.6.1
13
+ pymupdf==1.23.8
14
+ chainlit==0.7.700