Farid Karimli committed
Commit 638bffe
1 Parent(s): 9a544d2

LLaMa parser fix

code/.chainlit/config.toml CHANGED
@@ -23,7 +23,7 @@ allow_origins = ["*"]
  unsafe_allow_html = false
 
  # Process and display mathematical expressions. This can clash with "$" characters in messages.
- latex = false
+ latex = true
 
  # Automatically tag threads with the current chat profile (if a chat profile is used)
  auto_tag_thread = true
@@ -85,31 +85,34 @@ custom_meta_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/f/
  # custom_build = "./public/build"
 
  [UI.theme]
- default = "light"
+ default = "dark"
  #layout = "wide"
  #font_family = "Inter, sans-serif"
  # Override default MUI light theme. (Check theme.ts)
  [UI.theme.light]
- background = "#FAFAFA"
- paper = "#FFFFFF"
+ #background = "#FAFAFA"
+ #paper = "#FFFFFF"
 
  [UI.theme.light.primary]
- main = "#b22222" # Brighter shade of red
- dark = "#8b0000" # Darker shade of the brighter red
- light = "#ff6347" # Lighter shade of the brighter red
+ #main = "#F80061"
+ #dark = "#980039"
+ #light = "#FFE7EB"
  [UI.theme.light.text]
- primary = "#212121"
- secondary = "#616161"
+ #primary = "#212121"
+ #secondary = "#616161"
+
  # Override default MUI dark theme. (Check theme.ts)
  [UI.theme.dark]
- background = "#1C1C1C" # Slightly lighter dark background color
- paper = "#2A2A2A" # Slightly lighter dark paper color
+ #background = "#FAFAFA"
+ #paper = "#FFFFFF"
 
  [UI.theme.dark.primary]
- main = "#89CFF0" # Primary color
- dark = "#3700B3" # Dark variant of primary color
- light = "#CFBCFF" # Lighter variant of primary color
-
+ #main = "#F80061"
+ #dark = "#980039"
+ #light = "#FFE7EB"
+ [UI.theme.dark.text]
+ #primary = "#EEEEEE"
+ #secondary = "#BDBDBD"
 
  [meta]
- generated_by = "1.1.302"
+ generated_by = "1.1.304"
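
Note: with latex = true in .chainlit/config.toml, Chainlit renders math wrapped in dollar signs inside chat messages; as the config comment warns, this can clash with literal "$" characters. A minimal sketch (not part of this commit) of a Chainlit handler that relies on the flag, assuming the app is launched from the code/ directory so this config.toml is picked up:

import chainlit as cl

@cl.on_message
async def main(message: cl.Message):
    # With latex = true, the $ ... $ spans below are rendered as math in the UI.
    await cl.Message(
        content="Cross-entropy loss: $L = -\\sum_i y_i \\log \\hat{y}_i$, where $y_i$ is the true label."
    ).send()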
code/main.py CHANGED
@@ -173,4 +173,6 @@ async def main(message):
      answer_with_sources, source_elements, sources_dict = get_sources(res, answer)
      processor._process(message.content, answer, sources_dict)
 
+     answer_with_sources = answer_with_sources.replace("$$", "$")
+
      await cl.Message(content=answer_with_sources, elements=source_elements).send()
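
Note: the new line in main.py collapses display-math delimiters ("$$") to inline delimiters ("$") before the answer is sent, which pairs with the latex = true change above. A standalone sketch of that normalization step; the helper name is illustrative, not from the repo:

def normalize_math_delimiters(text: str) -> str:
    # Collapse $$ ... $$ display math to $ ... $ inline math, mirroring
    # the replace("$$", "$") call added in main.py.
    return text.replace("$$", "$")

print(normalize_math_delimiters("The loss is $$L = -\\sum_i y_i \\log \\hat{y}_i$$."))
# -> The loss is $L = -\sum_i y_i \log \hat{y}_i$.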
code/modules/config/config.yml CHANGED
@@ -7,7 +7,7 @@ vectorstore:
    data_path: '../storage/data' # str
    url_file_path: '../storage/data/urls.txt' # str
    expand_urls: True # bool
-   db_option : 'FAISS' # str [FAISS, Chroma, RAGatouille, RAPTOR]
+   db_option : 'RAGatouille' # str [FAISS, Chroma, RAGatouille, RAPTOR]
    db_path : '../vectorstores' # str
    model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
    search_top_k : 3 # int
@@ -32,7 +32,7 @@ llm_params:
    local_llm_params:
      model: 'tiny-llama'
      temperature: 0.7
-   pdf_reader: 'llama' # str [llama, pymupdf]
+   pdf_reader: 'llama' # str [llama, pymupdf, gpt]
 
  chat_logging:
    log_chat: False # bool
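
Note: config.yml now defaults db_option to 'RAGatouille' and documents a new 'gpt' choice for pdf_reader. A hedged sketch of how the pdf_reader flag could be read and dispatched; it assumes pdf_reader sits under llm_params, as the hunk header suggests, and the real dispatch lives in FileReader in data_loader.py and is not reproduced here:

import yaml

with open("code/modules/config/config.yml") as f:
    config = yaml.safe_load(f)

reader_kind = config["llm_params"]["pdf_reader"]  # 'llama', 'pymupdf', or 'gpt'
if reader_kind == "llama":
    print("Use LlamaParser (LlamaParse cloud API)")
elif reader_kind == "pymupdf":
    print("Use the PyMuPDF-based reader")
elif reader_kind == "gpt":
    print("Use the GPT-based reader (new option documented in this commit)")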
code/modules/config/constants.py CHANGED
@@ -15,7 +15,9 @@ opening_message = f"Hey, What Can I Help You With?\n\nYou can me ask me question
  # Prompt Templates
 
  openai_prompt_template = """Use the following pieces of information to answer the user's question.
- You are an intelligent chatbot designed to help students with questions regarding the course. Render math equations in LaTeX format between $$ signs, and explain the parameters and variables in the equations.
+ You are an intelligent chatbot designed to help students with questions regarding the course.
+ Render math equations in LaTeX format between $ or $$ signs, stick to the parameter and variable icons found in your context.
+ Be sure to explain the parameters and variables in the equations.
  If you don't know the answer, just say that you don't know.
 
  Context: {context}
@@ -26,8 +28,9 @@ Helpful answer:
  """
 
  openai_prompt_template_with_history = """Use the following pieces of information to answer the user's question.
- You are an intelligent chatbot designed to help students with questions regarding the course. Render math equations in LaTeX format between $$ signs.
-
+ You are an intelligent chatbot designed to help students with questions regarding the course.
+ Render math equations in LaTeX format between $ or $$ signs, stick to the parameter and variable icons found in your context.
+ Be sure to explain the parameters and variables in the equations.
  If you don't know the answer, just say that you don't know, don't try to make up an answer.
 
  Use the history to answer the question if you can.
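
Note: both prompt templates now ask the model to keep LaTeX between $ or $$ signs and to reuse the symbols found in the retrieved context. A hedged illustration of how a template with {context} and {question} placeholders is typically filled; PromptTemplate is a standard LangChain class, and whether the repo wires it exactly this way is an assumption:

from langchain.prompts import PromptTemplate

template = (
    "Use the following pieces of information to answer the user's question.\n"
    "Render math equations in LaTeX format between $ or $$ signs.\n\n"
    "Context: {context}\n\n"
    "Question: {question}\n\n"
    "Helpful answer:"
)
prompt = PromptTemplate(template=template, input_variables=["context", "question"])
print(prompt.format(
    context="Cross-entropy: $L = -\\sum_i y_i \\log \\hat{y}_i$",
    question="What do the symbols in the loss mean?",
))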
code/modules/dataloader/data_loader.py CHANGED
@@ -27,12 +27,10 @@ import tempfile
  import PyPDF2
 
  try:
-     from modules.dataloader.helpers import get_metadata
+     from modules.dataloader.helpers import get_metadata, download_pdf_from_url
      from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
-
-
  except:
-     from dataloader.helpers import get_metadata
+     from dataloader.helpers import get_metadata, download_pdf_from_url
      from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
 
  logger = logging.getLogger(__name__)
@@ -51,6 +49,7 @@ class PDFReader:
 
  class LlamaParser:
      def __init__(self):
+         print("Initializing LlamaParser")
          self.GPT_API_KEY = OPENAI_API_KEY
          self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
          self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
@@ -65,16 +64,30 @@ class LlamaParser:
              language="en",
              gpt4o_mode=False,
              # gpt4o_api_key=OPENAI_API_KEY,
-             parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source."
+             parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source."
          )
 
      def parse(self, pdf_path):
          pdf_name = os.path.basename(pdf_path)
 
+         if not os.path.exists(pdf_path):
+             logger.warning(f"File {pdf_name} does not exist locally, installing temporarily...")
+             pdf_path = download_pdf_from_url(pdf_path)
+
          documents = self.parser.load_data(pdf_path)
-         documents = [document.to_langchain_format() for document in documents]
+         document = [document.to_langchain_format() for document in documents][0]
+
+         content = document.page_content
+         pages = content.split("\n---\n")
+         pages = [page.strip() for page in pages]
+
+         documents = [
+             Document(
+                 page_content=page,
+                 metadata={"source": pdf_path, "page": i}
+             ) for i, page in enumerate(pages)
+         ]
 
-         os.remove(pdf_path) # cleanup, just in case
          return documents
 
      def make_request(self, pdf_url):
@@ -186,18 +199,6 @@ class FileReader:
              text += page.extract_text()
          return text
 
-     @staticmethod
-     def download_pdf_from_url(pdf_url):
-         response = requests.get(pdf_url)
-         if response.status_code == 200:
-             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
-                 temp_file.write(response.content)
-                 temp_file_path = temp_file.name
-             return temp_file_path
-         else:
-             self.logger.error(f"Failed to download PDF from URL: {pdf_url}")
-             return None
-
      def read_pdf(self, temp_file_path: str):
          if self.kind == "llama":
              documents = self.pdf_reader.parse(temp_file_path) # asyncio.run(self.pdf_reader.parse(temp_file_path)) if using async
@@ -383,22 +384,17 @@ class ChunkProcessor:
          )
          self.document_chunks_full.extend(document_chunks)
 
+         print(f"Processed {file_path}. File_data: {file_data}")
          self.document_data[file_path] = file_data
          self.document_metadata[file_path] = file_metadata
 
      def process_file(self, file_path, file_index, file_reader, addl_metadata):
          file_name = os.path.basename(file_path)
-         storage_dir = os.path.join(os.getcwd(), self.config["vectorstore"]["data_path"])
-         local_path = os.path.join(storage_dir, file_name)
-
-         if not os.path.exists(local_path):
-             local_path = FileReader.download_pdf_from_url(pdf_url=file_path)
 
          if file_name in self.document_data:
              return
 
-         file_type = file_name.split(".")[-1].lower()
-         self.logger.info(f"Reading file {file_index + 1}: {local_path}")
+         file_type = file_name.split(".")[-1]
 
          read_methods = {
              "pdf": file_reader.read_pdf,
@@ -412,9 +408,10 @@
              return
 
          try:
-             documents = read_methods[file_type](local_path)
+             documents = read_methods[file_type](file_path)
+
              self.process_documents(
-                 documents, local_path, file_type, "file", addl_metadata
+                 documents, file_path, file_type, "file", addl_metadata
              )
          except Exception as e:
              self.logger.error(f"Error processing file {file_name}: {str(e)}")
@@ -500,10 +497,11 @@ if __name__ == "__main__":
      data_loader = DataLoader(config, logger=logger)
      document_chunks, document_names, documents, document_metadata = (
          data_loader.get_chunks(
-             uploaded_files,
-             ["https://dl4ds.github.io/sp2024/"],
+             ["https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"],
+             [],
          )
      )
 
-     print(document_names)
+     print(document_names[:5])
      print(len(document_chunks))
+
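
Note: the core of the LLaMa parser fix is in LlamaParser.parse. Instead of keeping LlamaParse's output as a single LangChain document, the parsed markdown is split on the "\n---\n" page separator and re-wrapped as one Document per slide, with source and page metadata, so downstream chunking and source attribution work per page. A standalone sketch of that splitting logic; it assumes the Document class is LangChain's and that LlamaParse separates pages with "\n---\n", as the added code implies:

from langchain.docstore.document import Document

def split_markdown_into_pages(markdown_text: str, source: str):
    # Split the parsed markdown on the page separator and wrap each page
    # in a Document carrying source and page-number metadata.
    pages = [page.strip() for page in markdown_text.split("\n---\n")]
    return [
        Document(page_content=page, metadata={"source": source, "page": i})
        for i, page in enumerate(pages)
    ]

docs = split_markdown_into_pages("# Slide 1\ntext\n---\n# Slide 2\n$y = Wx + b$", "lecture05.pdf")
print(len(docs), docs[1].metadata)  # 2 {'source': 'lecture05.pdf', 'page': 1}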
code/modules/dataloader/helpers.py CHANGED
@@ -1,7 +1,7 @@
  import requests
  from bs4 import BeautifulSoup
- from tqdm import tqdm
-
+ from urllib.parse import urlparse
+ import tempfile
 
  def get_urls_from_file(file_path: str):
      """
@@ -106,3 +106,23 @@ def get_metadata(lectures_url, schedule_url):
              continue
 
      return lecture_metadata
+
+
+ def download_pdf_from_url(pdf_url):
+     """
+     Function to temporarily download a PDF file from a URL and return the local file path.
+
+     Args:
+         pdf_url (str): The URL of the PDF file to download.
+
+     Returns:
+         str: The local file path of the downloaded PDF file.
+     """
+     response = requests.get(pdf_url)
+     if response.status_code == 200:
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+             temp_file.write(response.content)
+             temp_file_path = temp_file.name
+         return temp_file_path
+     else:
+         return None
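
Note: download_pdf_from_url creates its temporary file with delete=False, and unlike the removed FileReader version paired with the dropped os.remove call in LlamaParser.parse, nothing deletes the file afterwards, so callers that care about disk usage should clean up themselves. A usage sketch under that assumption; the import path follows the try branch in data_loader.py:

import os
from modules.dataloader.helpers import download_pdf_from_url

path = download_pdf_from_url(
    "https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"
)
if path is not None:
    print(f"Downloaded to temporary file: {path}")
    # ... parse the PDF here ...
    os.remove(path)  # the helper does not delete the temp file itself
else:
    print("Download failed (non-200 response)")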