Thomas (Tom) Gardos committed
Commit 166d2a9
2 Parent(s): ccfbb8c 1052297

Merge pull request #50 from DL4DS/text_extraction
code/main.py CHANGED
@@ -67,16 +67,19 @@ class Chatbot:
     async def setup_llm(self):
         """
         Set up the LLM with the provided settings. Update the configuration and initialize the LLM tutor.
+
+        #TODO: Clean this up.
         """
         start_time = time.time()
 
         llm_settings = cl.user_session.get("llm_settings", {})
-        chat_profile, retriever_method, memory_window, llm_style, generate_follow_up = (
+        chat_profile, retriever_method, memory_window, llm_style, generate_follow_up, chunking_mode = (
             llm_settings.get("chat_model"),
             llm_settings.get("retriever_method"),
             llm_settings.get("memory_window"),
             llm_settings.get("llm_style"),
             llm_settings.get("follow_up_questions"),
+            llm_settings.get("chunking_mode"),
         )
 
         chain = cl.user_session.get("chain")
@@ -96,6 +99,7 @@ class Chatbot:
         self.config["llm_params"]["llm_style"] = llm_style
         self.config["llm_params"]["llm_loader"] = chat_profile
         self.config["llm_params"]["generate_follow_up"] = generate_follow_up
+        self.config["splitter_options"]["chunking_mode"] = chunking_mode
 
         self.llm_tutor.update_llm(
             old_config, self.config
@@ -173,6 +177,12 @@ class Chatbot:
                 label="Stream response",
                 initial=config["llm_params"]["stream"],
             ),
+            cl.input_widget.Select(
+                id="chunking_mode",
+                label="Chunking mode",
+                values=['fixed', 'semantic'],
+                initial_index=1,
+            ),
             cl.input_widget.Switch(
                 id="follow_up_questions",
                 label="Generate follow up questions",
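
The chunking_mode value chosen in the new Select widget is read back out of the Chainlit user session the same way as the other settings. A minimal sketch, assuming the llm_settings session key used above and an illustrative fallback to the config.yml default (the fallback is not part of the commit):

import chainlit as cl

def read_chunking_mode(config: dict) -> str:
    # Settings saved from the Chainlit settings panel live in the user session.
    llm_settings = cl.user_session.get("llm_settings", {})
    # Fall back to the value shipped in config.yml if the widget has not been touched yet.
    chunking_mode = llm_settings.get(
        "chunking_mode", config["splitter_options"]["chunking_mode"]
    )
    config["splitter_options"]["chunking_mode"] = chunking_mode
    return chunking_mode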
code/modules/config/config.yml CHANGED
@@ -39,6 +39,7 @@ llm_params:
   filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
   pdf_reader: 'pymupdf' # str [llama, pymupdf, gpt]
   stream: False # bool
+  pdf_reader: 'gpt' # str [llama, pymupdf, gpt]
 
 chat_logging:
   log_chat: True # bool
@@ -50,6 +51,7 @@ splitter_options:
   split_by_token : True # bool
   remove_leftover_delimiters: True # bool
   remove_chunks: False # bool
+  chunking_mode: 'semantic' # str [fixed, semantic]
   chunk_size : 300 # int
   chunk_overlap : 30 # int
   chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
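
For reference, a minimal sketch of reading these options with PyYAML, as the __main__ block in data_loader.py does; the path here is illustrative. Note that safe_load keeps the last occurrence of a duplicated key, so the two pdf_reader entries above resolve to 'gpt':

import yaml

with open("code/modules/config/config.yml", "r") as f:  # path relative to the repo root
    config = yaml.safe_load(f)

print(config["llm_params"]["pdf_reader"])           # 'gpt' (last duplicate key wins)
print(config["splitter_options"]["chunking_mode"])  # 'semantic'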
code/modules/dataloader/data_loader.py CHANGED
@@ -14,6 +14,8 @@ from llama_parse import LlamaParse
 from langchain.schema import Document
 import logging
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_experimental.text_splitter import SemanticChunker
+from langchain_openai.embeddings import OpenAIEmbeddings
 from ragatouille import RAGPretrainedModel
 from langchain.chains import LLMChain
 from langchain_community.llms import OpenAI
@@ -63,12 +65,11 @@ class HTMLReader:
                 href = href.replace("http", "https")
 
             absolute_url = urljoin(base_url, href)
-            link["href"] = absolute_url
+            link['href'] = absolute_url
 
             resp = requests.head(absolute_url)
             if resp.status_code != 200:
-                logger.warning(f"Link {absolute_url} is broken")
-                logger.warning(f"Status code: {resp.status_code}")
+                logger.warning(f"Link {absolute_url} is broken. Status code: {resp.status_code}")
 
         return str(soup)
 
@@ -84,7 +85,6 @@ class HTMLReader:
         else:
             return None
 
-
 class FileReader:
     def __init__(self, logger, kind):
         self.logger = logger
@@ -96,9 +96,7 @@ class FileReader:
         else:
             self.pdf_reader = PDFReader()
         self.web_reader = HTMLReader()
-        self.logger.info(
-            f"Initialized FileReader with {kind} PDF reader and HTML reader"
-        )
+        self.logger.info(f"Initialized FileReader with {kind} PDF reader and HTML reader")
 
     def extract_text_from_pdf(self, pdf_path):
         text = ""
@@ -156,21 +154,31 @@ class ChunkProcessor:
         self.document_metadata = {}
         self.document_chunks_full = []
 
+        if not config['vectorstore']['embedd_files']:
+            self.load_document_data()
+
         if config["splitter_options"]["use_splitter"]:
-            if config["splitter_options"]["split_by_token"]:
-                self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-                    chunk_size=config["splitter_options"]["chunk_size"],
-                    chunk_overlap=config["splitter_options"]["chunk_overlap"],
-                    separators=config["splitter_options"]["chunk_separators"],
-                    disallowed_special=(),
-                )
+            if config["splitter_options"]["chunking_mode"] == "fixed":
+                if config["splitter_options"]["split_by_token"]:
+                    self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+                        chunk_size=config["splitter_options"]["chunk_size"],
+                        chunk_overlap=config["splitter_options"]["chunk_overlap"],
+                        separators=config["splitter_options"]["chunk_separators"],
+                        disallowed_special=(),
+                    )
+                else:
+                    self.splitter = RecursiveCharacterTextSplitter(
+                        chunk_size=config["splitter_options"]["chunk_size"],
+                        chunk_overlap=config["splitter_options"]["chunk_overlap"],
+                        separators=config["splitter_options"]["chunk_separators"],
+                        disallowed_special=(),
+                    )
             else:
-                self.splitter = RecursiveCharacterTextSplitter(
-                    chunk_size=config["splitter_options"]["chunk_size"],
-                    chunk_overlap=config["splitter_options"]["chunk_overlap"],
-                    separators=config["splitter_options"]["chunk_separators"],
-                    disallowed_special=(),
+                self.splitter = SemanticChunker(
+                    OpenAIEmbeddings(),
+                    breakpoint_threshold_type="percentile"
                 )
+
         else:
             self.splitter = None
         self.logger.info("ChunkProcessor instance created")
@@ -193,16 +201,12 @@ class ChunkProcessor:
     def process_chunks(
         self, documents, file_type="txt", source="", page=0, metadata={}
     ):
+        # TODO: Clear up this pipeline of re-adding metadata
        documents = [Document(page_content=documents, source=source, page=page)]
-        if (
-            file_type == "txt"
-            or file_type == "docx"
-            or file_type == "srt"
-            or file_type == "tex"
-        ):
+        if file_type == "pdf" and self.config["splitter_options"]["chunking_mode"] == "fixed":
+            document_chunks = documents
+        else:
            document_chunks = self.splitter.split_documents(documents)
-        elif file_type == "pdf":
-            document_chunks = documents  # Full page for now
 
        # add the source and page number back to the metadata
        for chunk in document_chunks:
@@ -296,9 +300,6 @@ class ChunkProcessor:
     def process_file(self, file_path, file_index, file_reader, addl_metadata):
         file_name = os.path.basename(file_path)
 
-        if file_name in self.document_data:
-            return
-
         file_type = file_name.split(".")[-1]
 
         read_methods = {
@@ -313,7 +314,12 @@ class ChunkProcessor:
             return
 
         try:
-            documents = read_methods[file_type](file_path)
+
+            if file_path in self.document_data:
+                self.logger.warning(f"File {file_name} already processed")
+                documents = [Document(page_content=content) for content in self.document_data[file_path].values()]
+            else:
+                documents = read_methods[file_type](file_path)
 
             self.process_documents(
                 documents, file_path, file_type, "file", addl_metadata
@@ -372,13 +378,14 @@ class ChunkProcessor:
             f"{self.config['log_chunk_dir']}/metadata/doc_metadata.json", "r"
         ) as json_file:
             self.document_metadata = json.load(json_file)
+            self.logger.info(
+                f"Loaded document content from {self.config['log_chunk_dir']}/docs/doc_content.json. Total documents: {len(self.document_data)}"
+            )
 
 
 class DataLoader:
     def __init__(self, config, logger=None):
-        self.file_reader = FileReader(
-            logger=logger, kind=config["llm_params"]["pdf_reader"]
-        )
+        self.file_reader = FileReader(logger=logger, kind=config["llm_params"]["pdf_reader"])
         self.chunk_processor = ChunkProcessor(config, logger=logger)
 
     def get_chunks(self, uploaded_files, weblinks):
@@ -396,22 +403,19 @@ if __name__ == "__main__":
     with open("../code/modules/config/config.yml", "r") as f:
         config = yaml.safe_load(f)
 
-    STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
+    STORAGE_DIR = os.path.join(BASE_DIR, config['vectorstore']["data_path"])
     uploaded_files = [
-        os.path.join(STORAGE_DIR, file)
-        for file in os.listdir(STORAGE_DIR)
-        if file != "urls.txt"
+        os.path.join(STORAGE_DIR, file) for file in os.listdir(STORAGE_DIR) if file != "urls.txt"
     ]
 
     data_loader = DataLoader(config, logger=logger)
     document_chunks, document_names, documents, document_metadata = (
         data_loader.get_chunks(
-            [
-                "https://dl4ds.github.io/sp2024/static_files/discussion_slides/00_discussion.pdf"
-            ],
+            ["https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"],
             [],
         )
     )
 
     print(document_names[:5])
     print(len(document_chunks))
+
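
A standalone sketch of the semantic splitter wired into ChunkProcessor above, using the same SemanticChunker and OpenAIEmbeddings imports; the sample text is made up and an OPENAI_API_KEY is assumed to be set:

from langchain.schema import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# Embeddings are used to find semantic breakpoints between sentences,
# so chunk boundaries follow topic shifts rather than a fixed token count.
splitter = SemanticChunker(OpenAIEmbeddings(), breakpoint_threshold_type="percentile")

docs = [
    Document(
        page_content=(
            "Gradient descent updates the weights. Momentum smooths those updates. "
            "Unrelatedly, FAISS stores dense vectors for similarity search."
        )
    )
]
chunks = splitter.split_documents(docs)
for chunk in chunks:
    print(len(chunk.page_content), chunk.page_content[:60])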
code/modules/dataloader/pdf_readers/gpt.py CHANGED
@@ -23,7 +23,7 @@ class GPTParser:
     The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
     The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
     For images, give a description and if you can, a source. Separate each page with '---'.
-    Just respond with the markdown.
+    Just respond with the markdown. Do not include page numbers or any other metadata. Do not try to provide titles. Strictly the content.
     """
 
     def parse(self, pdf_path):
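
The tightened prompt still asks the model to separate pages with '---'. A hypothetical post-processing helper under that assumption (the function is illustrative, not part of the GPTParser API):

def split_markdown_pages(markdown: str) -> list[str]:
    # Split GPT-extracted markdown into per-page strings on the '---' separator
    # requested by the prompt, dropping empty fragments.
    return [page.strip() for page in markdown.split("---") if page.strip()]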
code/modules/vectorstore/faiss.py CHANGED
@@ -14,6 +14,10 @@ class FaissVectorStore(VectorStoreBase):
     def __init__(self, config):
         self.config = config
         self._init_vector_db()
+        self.local_path = os.path.join(self.config["vectorstore"]["db_path"],
+                                       "db_" + self.config["vectorstore"]["db_option"]
+                                       + "_" + self.config["vectorstore"]["model"]
+                                       + "_" + config["splitter_options"]["chunking_mode"])
 
     def _init_vector_db(self):
         self.faiss = FAISS(
@@ -25,24 +29,12 @@ class FaissVectorStore(VectorStoreBase):
             documents=document_chunks, embedding=embedding_model
         )
         self.vectorstore.save_local(
-            os.path.join(
-                self.config["vectorstore"]["db_path"],
-                "db_"
-                + self.config["vectorstore"]["db_option"]
-                + "_"
-                + self.config["vectorstore"]["model"],
-            )
+            self.local_path
         )
 
     def load_database(self, embedding_model):
         self.vectorstore = self.faiss.load_local(
-            os.path.join(
-                self.config["vectorstore"]["db_path"],
-                "db_"
-                + self.config["vectorstore"]["db_option"]
-                + "_"
-                + self.config["vectorstore"]["model"],
-            ),
+            self.local_path,
             embedding_model,
             allow_dangerous_deserialization=True,
         )
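
With the chunking mode appended, each mode persists to its own FAISS directory. A minimal sketch of the resulting path, using hypothetical config values rather than the project's actual settings:

import os

config = {  # hypothetical values for illustration; the real ones come from config.yml
    "vectorstore": {"db_path": "vector_db", "db_option": "FAISS", "model": "text-embedding-ada-002"},
    "splitter_options": {"chunking_mode": "semantic"},
}

local_path = os.path.join(
    config["vectorstore"]["db_path"],
    "db_" + config["vectorstore"]["db_option"]
    + "_" + config["vectorstore"]["model"]
    + "_" + config["splitter_options"]["chunking_mode"],
)
print(local_path)  # vector_db/db_FAISS_text-embedding-ada-002_semantic

# The store is then saved to and reloaded from this path, e.g.:
#   vectorstore.save_local(local_path)
#   FAISS.load_local(local_path, embedding_model, allow_dangerous_deserialization=True)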