Spaces:

dl4ds
/

dl4ds_tutor

Build error

App Files Files Community

XThomasBU committed on May 29

Commit

57b7b8d

•

1 Parent(s): fe158b7

modularized dataloader + Added Chroma

Browse files

Files changed (6) hide show

code/config.yml +2 -2
code/modules/data_loader.py +165 -223
code/modules/embedding_model_loader.py +2 -0
code/modules/helpers.py +11 -7
code/modules/llm_tutor.py +15 -13
code/modules/vector_db.py +75 -22

code/config.yml CHANGED Viewed

@@ -1,6 +1,5 @@
 embedding_options:
   embedd_files: False # bool
-  persist_directory: null # str or None
   data_path: 'storage/data' # str
   url_file_path: 'storage/data/urls.txt' # str
   expand_urls: True # bool
@@ -8,8 +7,9 @@ embedding_options:
   db_path : 'vectorstores' # str
   model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
   search_top_k : 3 # int
 llm_params:
-  use_history: False # bool
   memory_window: 3 # int
   llm_loader: 'local_llm' # str [local_llm, openai]
   openai_params:

 embedding_options:
   embedd_files: False # bool
   data_path: 'storage/data' # str
   url_file_path: 'storage/data/urls.txt' # str
   expand_urls: True # bool
   db_path : 'vectorstores' # str
   model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
   search_top_k : 3 # int
+  score_threshold : 0.5 # float
 llm_params:
+  use_history: True # bool
   memory_window: 3 # int
   llm_loader: 'local_llm' # str [local_llm, openai]
   openai_params:

code/modules/data_loader.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import re
 import pysrt
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.document_loaders import (
     PyMuPDFLoader,
     Docx2txtLoader,
@@ -8,49 +9,32 @@ from langchain.document_loaders import (
     WebBaseLoader,
     TextLoader,
 )
 from langchain.schema import Document
-import tempfile
-from tempfile import NamedTemporaryFile
 import logging
-import requests
 logger = logging.getLogger(__name__)
-class DataLoader:
-    def __init__(self, config):
-        """
-        Class for handling all data extraction and chunking
-        Inputs:
-            config - dictionary from yaml file, containing all important parameters
-        """
-        self.config = config
-        self.remove_leftover_delimiters = config["splitter_options"][
-            "remove_leftover_delimiters"
-        ]
-        # Main list of all documents
-        self.document_chunks_full = []
-        self.document_names = []
-        if config["splitter_options"]["use_splitter"]:
-            if config["splitter_options"]["split_by_token"]:
-                self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-                    chunk_size=config["splitter_options"]["chunk_size"],
-                    chunk_overlap=config["splitter_options"]["chunk_overlap"],
-                    separators=config["splitter_options"]["chunk_separators"],
-                    disallowed_special=()
-                )
-            else:
-                self.splitter = RecursiveCharacterTextSplitter(
-                    chunk_size=config["splitter_options"]["chunk_size"],
-                    chunk_overlap=config["splitter_options"]["chunk_overlap"],
-                    separators=config["splitter_options"]["chunk_separators"],
-                    disallowed_special=()
-                )
-        else:
-            self.splitter = None
-        logger.info("InfoLoader instance created")
     def extract_text_from_pdf(self, pdf_path):
         text = ""
@@ -73,215 +57,173 @@ class DataLoader:
             print("Failed to download PDF from URL:", pdf_url)
             return None
-    def get_chunks(self, uploaded_files, weblinks):
-        # Main list of all documents
-        self.document_chunks_full = []
-        self.document_names = []
-        def remove_delimiters(document_chunks: list):
-            """
-            Helper function to remove remaining delimiters in document chunks
-            """
-            for chunk in document_chunks:
-                for delimiter in self.config["splitter_options"][
-                    "delimiters_to_remove"
-                ]:
-                    chunk.page_content = re.sub(delimiter, " ", chunk.page_content)
-            return document_chunks
-        def remove_chunks(document_chunks: list):
-            """
-            Helper function to remove any unwanted document chunks after splitting
-            """
-            front = self.config["splitter_options"]["front_chunk_to_remove"]
-            end = self.config["splitter_options"]["last_chunks_to_remove"]
-            # Remove pages
-            for _ in range(front):
-                del document_chunks[0]
-            for _ in range(end):
-                document_chunks.pop()
-                logger.info(f"\tNumber of pages after skipping: {len(document_chunks)}")
-            return document_chunks
-        def get_pdf_from_url(pdf_url: str):
-            temp_pdf_path = self.download_pdf_from_url(pdf_url)
-            if temp_pdf_path:
-                title, document_chunks = get_pdf(temp_pdf_path, pdf_url)
-                os.remove(temp_pdf_path)
-                return title, document_chunks
-        def get_pdf(temp_file_path: str, title: str):
-            """
-            Function to process PDF files
-            """
-            loader = PyMuPDFLoader(
-                temp_file_path
-            )  # This loader preserves more metadata
-            if self.splitter:
-                document_chunks = self.splitter.split_documents(loader.load())
-            else:
-                document_chunks = loader.load()
-            if "title" in document_chunks[0].metadata.keys():
-                title = document_chunks[0].metadata["title"]
-            logger.info(
-                f"\t\tOriginal no. of pages: {document_chunks[0].metadata['total_pages']}"
-            )
-            return title, document_chunks
-        def get_txt(temp_file_path: str, title: str):
-            """
-            Function to process TXT files
-            """
-            loader = TextLoader(temp_file_path, autodetect_encoding=True)
-            if self.splitter:
-                document_chunks = self.splitter.split_documents(loader.load())
-            else:
-                document_chunks = loader.load()
-            # Update the metadata
-            for chunk in document_chunks:
-                chunk.metadata["source"] = title
-                chunk.metadata["page"] = "N/A"
-            return title, document_chunks
-        def get_srt(temp_file_path: str, title: str):
-            """
-            Function to process SRT files
-            """
-            subs = pysrt.open(temp_file_path)
-            text = ""
-            for sub in subs:
-                text += sub.text
-            document_chunks = [Document(page_content=text)]
-            if self.splitter:
-                document_chunks = self.splitter.split_documents(document_chunks)
-            # Update the metadata
-            for chunk in document_chunks:
-                chunk.metadata["source"] = title
-                chunk.metadata["page"] = "N/A"
-            return title, document_chunks
-        def get_docx(temp_file_path: str, title: str):
-            """
-            Function to process DOCX files
-            """
-            loader = Docx2txtLoader(temp_file_path)
-            if self.splitter:
-                document_chunks = self.splitter.split_documents(loader.load())
-            else:
-                document_chunks = loader.load()
-            # Update the metadata
-            for chunk in document_chunks:
-                chunk.metadata["source"] = title
-                chunk.metadata["page"] = "N/A"
-            return title, document_chunks
-        def get_youtube_transcript(url: str):
-            """
-            Function to retrieve youtube transcript and process text
-            """
-            loader = YoutubeLoader.from_youtube_url(
-                url, add_video_info=True, language=["en"], translation="en"
-            )
-            if self.splitter:
-                document_chunks = self.splitter.split_documents(loader.load())
             else:
-                document_chunks = loader.load_and_split()
-            # Replace the source with title (for display in st UI later)
-            for chunk in document_chunks:
-                chunk.metadata["source"] = chunk.metadata["title"]
-            logger.info(chunk.metadata["title"])
-            return title, document_chunks
-        def get_html(url: str):
-            """
-            Function to process websites via HTML files
-            """
-            loader = WebBaseLoader(url)
-            if self.splitter:
-                document_chunks = self.splitter.split_documents(loader.load())
-            else:
-                document_chunks = loader.load_and_split()
-            title = document_chunks[0].metadata["title"]
-            logger.info(document_chunks[0].metadata)
-            return title, document_chunks
-        # Handle file by file
         for file_index, file_path in enumerate(uploaded_files):
-            file_name = file_path.split("/")[-1]
-            file_type = file_name.split(".")[-1]
-            # Handle different file types
-            if file_type == "pdf":
-                try:
-                    title, document_chunks = get_pdf(file_path, file_name)
-                except:
-                    title, document_chunks = get_pdf_from_url(file_path)
-            elif file_type == "txt":
-                title, document_chunks = get_txt(file_path, file_name)
-            elif file_type == "docx":
-                title, document_chunks = get_docx(file_path, file_name)
-            elif file_type == "srt":
-                title, document_chunks = get_srt(file_path, file_name)
-            # Additional wrangling - Remove leftover delimiters and any specified chunks
-            if self.remove_leftover_delimiters:
-                document_chunks = remove_delimiters(document_chunks)
-            if self.config["splitter_options"]["remove_chunks"]:
-                document_chunks = remove_chunks(document_chunks)
-            logger.info(f"\t\tExtracted no. of chunks: {len(document_chunks)} from {file_name}")
-            self.document_names.append(title)
-            self.document_chunks_full.extend(document_chunks)
-        # Handle youtube links:
         if weblinks[0] != "":
             logger.info(f"Splitting weblinks: total of {len(weblinks)}")
-            # Handle link by link
             for link_index, link in enumerate(weblinks):
                 try:
                     logger.info(f"\tSplitting link {link_index+1} : {link}")
                     if "youtube" in link:
-                        title, document_chunks = get_youtube_transcript(link)
                     else:
-                        title, document_chunks = get_html(link)
-                    # Additional wrangling - Remove leftover delimiters and any specified chunks
-                    if self.remove_leftover_delimiters:
-                        document_chunks = remove_delimiters(document_chunks)
-                    if self.config["splitter_options"]["remove_chunks"]:
-                        document_chunks = remove_chunks(document_chunks)
-                    print(f"\t\tExtracted no. of chunks: {len(document_chunks)}")
-                    self.document_names.append(title)
                     self.document_chunks_full.extend(document_chunks)
-                except:
-                    logger.info(f"\t\tError splitting link {link_index+1} : {link}")
-                    exit()
-        logger.info(
-            f"\tNumber of document chunks extracted in total: {len(self.document_chunks_full)}\n\n"
-        )
-        return self.document_chunks_full, self.document_names

+import os
 import re
+import requests
 import pysrt
 from langchain.document_loaders import (
     PyMuPDFLoader,
     Docx2txtLoader,
     WebBaseLoader,
     TextLoader,
 )
+from langchain_community.document_loaders import UnstructuredMarkdownLoader
+from llama_parse import LlamaParse
 from langchain.schema import Document
 import logging
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_experimental.text_splitter import SemanticChunker
+from langchain_openai.embeddings import OpenAIEmbeddings
 logger = logging.getLogger(__name__)
+class PDFReader:
+    """Small helper that loads PDF files through ``PyMuPDFLoader``."""
+    def __init__(self):
+        # Nothing to configure; the reader keeps no state.
+        pass
+    def get_loader(self, pdf_path):
+        """Return a ``PyMuPDFLoader`` created for ``pdf_path``."""
+        return PyMuPDFLoader(pdf_path)
+    def get_documents(self, loader):
+        """Return the result of calling ``loader.load()``."""
+        documents = loader.load()
+        return documents
+class FileReader:
+    def __init__(self):
+        # PDF loading is delegated to a dedicated PDFReader instance.
+        self.pdf_reader = PDFReader()
     def extract_text_from_pdf(self, pdf_path):
         text = ""
             print("Failed to download PDF from URL:", pdf_url)
             return None
+    def read_pdf(self, temp_file_path: str):
+        """Load the PDF at ``temp_file_path`` and return its documents.
+
+        The work is delegated to ``self.pdf_reader``: a loader is obtained for
+        the path and its documents are returned unchanged.
+        """
+        # NOTE: an alternative LlamaParse -> markdown -> UnstructuredMarkdownLoader
+        # route was left disabled here; only the PDFReader path is active.
+        pdf_reader = self.pdf_reader
+        return pdf_reader.get_documents(pdf_reader.get_loader(temp_file_path))
+    def read_txt(self, temp_file_path: str):
+        """Load a text file with ``TextLoader`` (encoding auto-detection on)."""
+        return TextLoader(temp_file_path, autodetect_encoding=True).load()
+    def read_docx(self, temp_file_path: str):
+        """Load a DOCX file with ``Docx2txtLoader`` and return its documents."""
+        return Docx2txtLoader(temp_file_path).load()
+    def read_srt(self, temp_file_path: str):
+        """Read an SRT subtitle file into a single ``Document``.
+
+        The text of every subtitle cue is combined into one string, which is
+        returned as a one-element list so it can be chunked like other inputs.
+        """
+        subs = pysrt.open(temp_file_path)
+        # Join cues with a space: plain concatenation fuses the last word of
+        # one cue with the first word of the next.
+        text = " ".join(sub.text for sub in subs)
+        return [Document(page_content=text)]
+    def read_youtube_transcript(self, url: str):
+        """Load the transcript documents for the YouTube video at ``url``.
+
+        The loader is asked for video info, the English transcript, and
+        translation to English.
+        """
+        # NOTE(review): YoutubeLoader is not among the imports visible in this
+        # view of the file -- confirm it is imported.
+        transcript_loader = YoutubeLoader.from_youtube_url(
+            url,
+            add_video_info=True,
+            language=["en"],
+            translation="en",
+        )
+        documents = transcript_loader.load()
+        return documents
+    def read_html(self, url: str):
+        """Load the web page at ``url`` with ``WebBaseLoader``."""
+        return WebBaseLoader(url).load()
+class ChunkProcessor:
+    """Turn loaded documents into cleaned chunks.
+
+    All settings are read from ``config["splitter_options"]``: whether and how
+    to split, which delimiters to blank out, and how many leading/trailing
+    chunks to drop. Results accumulate in ``document_chunks_full`` and
+    ``document_names``.
+    """
+    def __init__(self, config):
+        """Store ``config`` and build the splitter it asks for (or none)."""
+        self.config = config
+        splitter_options = config["splitter_options"]
+        self.remove_leftover_delimiters = splitter_options[
+            "remove_leftover_delimiters"
+        ]
+        self.document_chunks_full = []
+        self.document_names = []
+        if not splitter_options["use_splitter"]:
+            self.splitter = None
+        elif splitter_options["split_by_token"]:
+            self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+                chunk_size=splitter_options["chunk_size"],
+                chunk_overlap=splitter_options["chunk_overlap"],
+                separators=splitter_options["chunk_separators"],
+                disallowed_special=(),
+            )
+        else:
+            # ``disallowed_special`` is a tiktoken-encoder option; the plain
+            # character splitter's constructor does not accept it, so it is
+            # only passed in the token-based branch above.
+            self.splitter = RecursiveCharacterTextSplitter(
+                chunk_size=splitter_options["chunk_size"],
+                chunk_overlap=splitter_options["chunk_overlap"],
+                separators=splitter_options["chunk_separators"],
+            )
+        logger.info("ChunkProcessor instance created")
+    def remove_delimiters(self, document_chunks: list):
+        """Replace each configured delimiter pattern in every chunk with a space.
+
+        Entries of ``delimiters_to_remove`` are used as regular expressions.
+        Chunks are modified in place and the same list is returned.
+        """
+        patterns = self.config["splitter_options"]["delimiters_to_remove"]
+        for chunk in document_chunks:
+            for delimiter in patterns:
+                chunk.page_content = re.sub(delimiter, " ", chunk.page_content)
+        return document_chunks
+    def remove_chunks(self, document_chunks: list):
+        """Drop the configured number of chunks from the front and the end.
+
+        The list is trimmed in place and returned. If it holds fewer chunks
+        than requested, whatever is there is removed instead of raising.
+        """
+        front = self.config["splitter_options"]["front_chunk_to_remove"]
+        end = self.config["splitter_options"]["last_chunks_to_remove"]
+        # Slice deletion is bounded by the list length, unlike repeated
+        # ``del lst[0]`` / ``lst.pop()`` which raise on an exhausted list.
+        if front > 0:
+            del document_chunks[:front]
+        if end > 0:
+            del document_chunks[max(len(document_chunks) - end, 0):]
+        logger.info(f"\tNumber of pages after skipping: {len(document_chunks)}")
+        return document_chunks
+    def process_chunks(self, documents):
+        """Split ``documents`` (if a splitter is set) and apply the clean-up steps."""
+        if self.splitter:
+            document_chunks = self.splitter.split_documents(documents)
+        else:
+            document_chunks = documents
+        if self.remove_leftover_delimiters:
+            document_chunks = self.remove_delimiters(document_chunks)
+        if self.config["splitter_options"]["remove_chunks"]:
+            document_chunks = self.remove_chunks(document_chunks)
+        return document_chunks
+    def get_chunks(self, file_reader, uploaded_files, weblinks):
+        """Read, chunk and collect every file and web link.
+
+        Files are dispatched on their lower-cased extension (pdf, txt, docx,
+        srt); other types are skipped with a warning. A failure on one file is
+        logged and does not stop the remaining files.
+
+        Returns ``(document_chunks_full, document_names)``.
+        """
+        self.document_chunks_full = []
+        self.document_names = []
+        for file_path in uploaded_files:
+            file_name = os.path.basename(file_path)
+            file_type = file_name.split(".")[-1].lower()
+            try:
+                if file_type == "pdf":
+                    documents = file_reader.read_pdf(file_path)
+                elif file_type == "txt":
+                    documents = file_reader.read_txt(file_path)
+                elif file_type == "docx":
+                    documents = file_reader.read_docx(file_path)
+                elif file_type == "srt":
+                    documents = file_reader.read_srt(file_path)
+                else:
+                    logger.warning(f"Unsupported file type: {file_type}")
+                    continue
+                document_chunks = self.process_chunks(documents)
+                self.document_names.append(file_name)
+                self.document_chunks_full.extend(document_chunks)
+            except Exception as e:
+                logger.error(f"Error processing file {file_name}: {str(e)}")
+        self.process_weblinks(file_reader, weblinks)
+        logger.info(
+            f"Total document chunks extracted: {len(self.document_chunks_full)}"
+        )
+        return self.document_chunks_full, self.document_names
+    def process_weblinks(self, file_reader, weblinks):
+        """Read and chunk every link in ``weblinks``, appending to the results.
+
+        Links containing "youtube" are read as transcripts, all others as HTML
+        pages. A failure on one link is logged and the loop continues.
+        """
+        # An empty list, or a first entry of "" (no links given), means there
+        # is nothing to do; guarding here avoids indexing an empty list.
+        if not weblinks or weblinks[0] == "":
+            return
+        logger.info(f"Splitting weblinks: total of {len(weblinks)}")
+        for link_index, link in enumerate(weblinks):
+            try:
+                logger.info(f"\tSplitting link {link_index+1} : {link}")
+                if "youtube" in link:
+                    documents = file_reader.read_youtube_transcript(link)
+                else:
+                    documents = file_reader.read_html(link)
+                document_chunks = self.process_chunks(documents)
+                self.document_names.append(link)
+                self.document_chunks_full.extend(document_chunks)
+            except Exception as e:
+                logger.error(
+                    f"Error splitting link {link_index+1} : {link}: {str(e)}"
+                )
+class DataLoader:
+    """Facade that pairs a ``FileReader`` with a ``ChunkProcessor``."""
+    def __init__(self, config):
+        """Create the reader and a chunk processor built from ``config``."""
+        self.file_reader = FileReader()
+        self.chunk_processor = ChunkProcessor(config)
+    def get_chunks(self, uploaded_files, weblinks):
+        """Forward to the chunk processor, handing it this loader's file reader."""
+        processor = self.chunk_processor
+        return processor.get_chunks(self.file_reader, uploaded_files, weblinks)

code/modules/embedding_model_loader.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from langchain_community.embeddings import OpenAIEmbeddings
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.embeddings import LlamaCppEmbeddings
 try:
     from modules.constants import *
 except:
@@ -19,6 +20,7 @@ class EmbeddingModelLoader:
                 model=self.config["embedding_options"]["model"],
                 show_progress_bar=True,
                 openai_api_key=OPENAI_API_KEY,
             )
         else:
             embedding_model = HuggingFaceEmbeddings(

 from langchain_community.embeddings import OpenAIEmbeddings
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.embeddings import LlamaCppEmbeddings
 try:
     from modules.constants import *
 except:
                 model=self.config["embedding_options"]["model"],
                 show_progress_bar=True,
                 openai_api_key=OPENAI_API_KEY,
+                disallowed_special=(),
             )
         else:
             embedding_model = HuggingFaceEmbeddings(

code/modules/helpers.py CHANGED Viewed

@@ -4,6 +4,7 @@ from tqdm import tqdm
 from urllib.parse import urlparse
 import chainlit as cl
 from langchain import PromptTemplate
 try:
     from modules.constants import *
 except:
@@ -60,7 +61,7 @@ class WebpageCrawler:
     def get_subpage_links(self, l, base_url):
         for link in tqdm(l):
-            print('checking link:', link)
             if not link.endswith("/"):
                 l[link] = "Checked"
                 dict_links_subpages = {}
@@ -109,6 +110,7 @@ def get_base_url(url):
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
     return base_url
 def get_prompt(config):
     if config["llm_params"]["use_history"]:
         if config["llm_params"]["llm_loader"] == "local_llm":
@@ -134,6 +136,7 @@ def get_prompt(config):
         )
     return prompt
 def get_sources(res, answer):
     source_elements_dict = {}
     source_elements = []
@@ -144,21 +147,22 @@ def get_sources(res, answer):
     for idx, source in enumerate(res["source_documents"]):
         source_metadata = source.metadata
         url = source_metadata["source"]
         if url not in source_dict:
-            source_dict[url] = [source.page_content]
         else:
-            source_dict[url].append(source.page_content)
     for source_idx, (url, text_list) in enumerate(source_dict.items()):
         full_text = ""
-        for url_idx, text in enumerate(text_list):
-            full_text += f"Source {url_idx+1}:\n {text}\n\n\n"
         source_elements.append(cl.Text(name=url, content=full_text))
-        found_sources.append(url)
     if found_sources:
-        answer += f"\n\nSources: {', '.join(found_sources)} "
     else:
         answer += f"\n\nNo source found."

 from urllib.parse import urlparse
 import chainlit as cl
 from langchain import PromptTemplate
 try:
     from modules.constants import *
 except:
     def get_subpage_links(self, l, base_url):
         for link in tqdm(l):
+            print("checking link:", link)
             if not link.endswith("/"):
                 l[link] = "Checked"
                 dict_links_subpages = {}
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
     return base_url
 def get_prompt(config):
     if config["llm_params"]["use_history"]:
         if config["llm_params"]["llm_loader"] == "local_llm":
         )
     return prompt
 def get_sources(res, answer):
     source_elements_dict = {}
     source_elements = []
     for idx, source in enumerate(res["source_documents"]):
         source_metadata = source.metadata
         url = source_metadata["source"]
+        score = source_metadata.get("score", "N/A")
         if url not in source_dict:
+            source_dict[url] = [(source.page_content, score)]
         else:
+            source_dict[url].append((source.page_content, score))
     for source_idx, (url, text_list) in enumerate(source_dict.items()):
         full_text = ""
+        for url_idx, (text, score) in enumerate(text_list):
+            full_text += f"Source {url_idx + 1} (Score: {score}):\n{text}\n\n\n"
         source_elements.append(cl.Text(name=url, content=full_text))
+        found_sources.append(f"{url} (Score: {score})")
     if found_sources:
+        answer += f"\n\nSources: {', '.join(found_sources)}"
     else:
         answer += f"\n\nNo source found."

code/modules/llm_tutor.py CHANGED Viewed

@@ -12,7 +12,7 @@ import os
 from modules.constants import *
 from modules.helpers import get_prompt
 from modules.chat_model_loader import ChatModelLoader
-from modules.vector_db import VectorDB
 class LLMTutor:
@@ -34,19 +34,25 @@ class LLMTutor:
     # Retrieval QA Chain
     def retrieval_qa_chain(self, llm, prompt, db):
         if self.config["llm_params"]["use_history"]:
             memory = ConversationBufferWindowMemory(
-            k = self.config["llm_params"]["memory_window"],
-            memory_key="chat_history", return_messages=True, output_key="answer"
             )
             qa_chain = ConversationalRetrievalChain.from_llm(
                 llm=llm,
                 chain_type="stuff",
-                retriever=db.as_retriever(
-                    search_kwargs={
-                        "k": self.config["embedding_options"]["search_top_k"]
-                    }
-                ),
                 return_source_documents=True,
                 memory=memory,
                 combine_docs_chain_kwargs={"prompt": prompt},
@@ -55,11 +61,7 @@ class LLMTutor:
             qa_chain = RetrievalQA.from_chain_type(
                 llm=llm,
                 chain_type="stuff",
-                retriever=db.as_retriever(
-                    search_kwargs={
-                        "k": self.config["embedding_options"]["search_top_k"]
-                    }
-                ),
                 return_source_documents=True,
                 chain_type_kwargs={"prompt": prompt},
             )

 from modules.constants import *
 from modules.helpers import get_prompt
 from modules.chat_model_loader import ChatModelLoader
+from modules.vector_db import VectorDB, VectorDBScore
 class LLMTutor:
     # Retrieval QA Chain
     def retrieval_qa_chain(self, llm, prompt, db):
+        retriever = VectorDBScore(
+            vectorstore=db,
+            search_type="similarity_score_threshold",
+            search_kwargs={
+                "score_threshold": self.config["embedding_options"]["score_threshold"],
+                "k": self.config["embedding_options"]["search_top_k"],
+            },
+        )
         if self.config["llm_params"]["use_history"]:
             memory = ConversationBufferWindowMemory(
+                k=self.config["llm_params"]["memory_window"],
+                memory_key="chat_history",
+                return_messages=True,
+                output_key="answer",
             )
             qa_chain = ConversationalRetrievalChain.from_llm(
                 llm=llm,
                 chain_type="stuff",
+                retriever=retriever,
                 return_source_documents=True,
                 memory=memory,
                 combine_docs_chain_kwargs={"prompt": prompt},
             qa_chain = RetrievalQA.from_chain_type(
                 llm=llm,
                 chain_type="stuff",
+                retriever=retriever,
                 return_source_documents=True,
                 chain_type_kwargs={"prompt": prompt},
             )

code/modules/vector_db.py CHANGED Viewed

@@ -1,7 +1,10 @@
 import logging
 import os
 import yaml
-from langchain.vectorstores import FAISS
 try:
     from modules.embedding_model_loader import EmbeddingModelLoader
@@ -15,6 +18,24 @@ except:
     from helpers import *
 class VectorDB:
     def __init__(self, config, logger=None):
         self.config = config
@@ -61,10 +82,12 @@ class VectorDB:
         return files, urls
     def clean_url_list(self, urls):
-        # get lecture pdf links
         lecture_pdfs = [link for link in urls if link.endswith(".pdf")]
         lecture_pdfs = [link for link in lecture_pdfs if "lecture" in link.lower()]
-        urls = [link for link in urls if link.endswith("/")] # only keep links that end with a '/'. Extract Files Seperately
         return urls, lecture_pdfs
@@ -81,6 +104,18 @@ class VectorDB:
             self.vector_db = FAISS.from_documents(
                 documents=document_chunks, embedding=self.embedding_model
             )
         self.logger.info("Completed initializing vector_db")
     def create_database(self):
@@ -89,7 +124,8 @@ class VectorDB:
         files, urls = self.load_files()
         urls, lecture_pdfs = self.clean_url_list(urls)
         files += lecture_pdfs
-        files.remove('storage/data/urls.txt')
         document_chunks, document_names = data_loader.get_chunks(files, urls)
         self.logger.info("Completed loading data")
@@ -97,29 +133,46 @@ class VectorDB:
         self.initialize_database(document_chunks, document_names)
     def save_database(self):
-        self.vector_db.save_local(
-            os.path.join(
-                self.config["embedding_options"]["db_path"],
-                "db_"
-                + self.config["embedding_options"]["db_option"]
-                + "_"
-                + self.config["embedding_options"]["model"],
             )
-        )
         self.logger.info("Saved database")
     def load_database(self):
         self.create_embedding_model()
-        self.vector_db = FAISS.load_local(
-            os.path.join(
-                self.config["embedding_options"]["db_path"],
-                "db_"
-                + self.config["embedding_options"]["db_option"]
-                + "_"
-                + self.config["embedding_options"]["model"],
-            ),
-            self.embedding_model,
-        )
         self.logger.info("Loaded database")
         return self.vector_db

 import logging
 import os
 import yaml
+from langchain.vectorstores import FAISS, Chroma
+from langchain.schema.vectorstore import VectorStoreRetriever
+from langchain.callbacks.manager import CallbackManagerForRetrieverRun
+from langchain.schema.document import Document
 try:
     from modules.embedding_model_loader import EmbeddingModelLoader
     from helpers import *
+class VectorDBScore(VectorStoreRetriever):
+    """Retriever that stores each hit's relevance score in its metadata."""
+    # See https://github.com/langchain-ai/langchain/blob/61dd92f8215daef3d9cf1734b0d1f8c70c1571c3/libs/langchain/langchain/vectorstores/base.py#L500
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ):
+        """Return documents for ``query`` with ``metadata["score"]`` filled in.
+
+        The search always goes through
+        ``similarity_search_with_relevance_scores`` using ``self.search_kwargs``.
+        """
+        scored_hits = self.vectorstore.similarity_search_with_relevance_scores(
+            query, **self.search_kwargs
+        )
+        documents = []
+        for document, relevance in scored_hits:
+            # Make the score part of the document metadata
+            document.metadata["score"] = relevance
+            documents.append(document)
+        return documents
 class VectorDB:
     def __init__(self, config, logger=None):
         self.config = config
         return files, urls
     def clean_url_list(self, urls):
+        """Split ``urls`` into page links and lecture PDF links.
+
+        Returns ``(urls, lecture_pdfs)``: ``urls`` keeps only the links that
+        end with "/", and ``lecture_pdfs`` holds the links that end with
+        ".pdf" and contain "lecture" (case-insensitive).
+        """
+        # get lecture pdf links
         lecture_pdfs = [link for link in urls if link.endswith(".pdf")]
         lecture_pdfs = [link for link in lecture_pdfs if "lecture" in link.lower()]
+        urls = [
+            link for link in urls if link.endswith("/")
+        ]  # only keep links that end with a '/'. Extract Files Separately
         return urls, lecture_pdfs
             self.vector_db = FAISS.from_documents(
                 documents=document_chunks, embedding=self.embedding_model
             )
+        elif self.db_option == "Chroma":
+            self.vector_db = Chroma.from_documents(
+                documents=document_chunks,
+                embedding=self.embedding_model,
+                persist_directory=os.path.join(
+                    self.config["embedding_options"]["db_path"],
+                    "db_"
+                    + self.config["embedding_options"]["db_option"]
+                    + "_"
+                    + self.config["embedding_options"]["model"],
+                ),
+            )
         self.logger.info("Completed initializing vector_db")
     def create_database(self):
         files, urls = self.load_files()
         urls, lecture_pdfs = self.clean_url_list(urls)
         files += lecture_pdfs
+        if "storage/data/urls.txt" in files:
+            files.remove("storage/data/urls.txt")
         document_chunks, document_names = data_loader.get_chunks(files, urls)
         self.logger.info("Completed loading data")
         self.initialize_database(document_chunks, document_names)
     def save_database(self):
+        """Persist the vector store for the configured ``db_option``.
+
+        FAISS is written to ``<db_path>/db_<db_option>_<model>``; Chroma needs
+        no explicit save. "Saved database" is logged in every case, including
+        when ``db_option`` matches neither branch.
+        """
+        if self.db_option == "FAISS":
+            self.vector_db.save_local(
+                os.path.join(
+                    self.config["embedding_options"]["db_path"],
+                    "db_"
+                    + self.config["embedding_options"]["db_option"]
+                    + "_"
+                    + self.config["embedding_options"]["model"],
+                )
             )
+        elif self.db_option == "Chroma":
+            # db is saved in the persist directory during initialization
+            pass
         self.logger.info("Saved database")
     def load_database(self):
+        """Create the embedding model, load the stored vector store, return it.
+
+        Both backends read from ``<db_path>/db_<db_option>_<model>``.
+        NOTE(review): if ``db_option`` is neither "FAISS" nor "Chroma",
+        ``self.vector_db`` is not assigned here before being returned.
+        """
         self.create_embedding_model()
+        if self.db_option == "FAISS":
+            self.vector_db = FAISS.load_local(
+                os.path.join(
+                    self.config["embedding_options"]["db_path"],
+                    "db_"
+                    + self.config["embedding_options"]["db_option"]
+                    + "_"
+                    + self.config["embedding_options"]["model"],
+                ),
+                self.embedding_model,
+                # NOTE(review): security -- this flag opts in to what the
+                # library itself labels dangerous deserialization; only load
+                # index directories this application wrote itself.
+                allow_dangerous_deserialization=True,
+            )
+        elif self.db_option == "Chroma":
+            self.vector_db = Chroma(
+                persist_directory=os.path.join(
+                    self.config["embedding_options"]["db_path"],
+                    "db_"
+                    + self.config["embedding_options"]["db_option"]
+                    + "_"
+                    + self.config["embedding_options"]["model"],
+                ),
+                embedding_function=self.embedding_model,
+            )
         self.logger.info("Loaded database")
         return self.vector_db