Thomas (Tom) Gardos committed on
Commit 36e2567
Parents: 8b05bb6, b2b3fe2

Merge pull request #58 from DL4DS/setup_and_format_instructions

.flake8 ADDED
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length = 88
+extend-ignore = E203, E266, E501, W503
.github/workflows/code_quality_check.yml ADDED
@@ -0,0 +1,33 @@
+name: Code Quality and Security Checks
+
+on:
+  push:
+    branches: [ main, dev_branch ]
+  pull_request:
+    branches: [ main, dev_branch ]
+
+jobs:
+  code-quality:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install flake8 black bandit
+
+      - name: Run Black
+        run: black --check .
+
+      - name: Run Flake8
+        run: flake8 .
+
+      - name: Run Bandit
+        run: |
+          bandit -r .
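The same three checks can be reproduced locally before pushing; a minimal sketch, assuming the tools are installed with `pip install flake8 black bandit` as in the workflow above:

```python
# Sketch: run the CI checks locally; assumes flake8, black, and bandit are pip-installed.
import subprocess

for cmd in (["black", "--check", "."], ["flake8", "."], ["bandit", "-r", "."]):
    print("Running:", " ".join(cmd))
    subprocess.run(cmd, check=True)  # check=True raises CalledProcessError on any failure
```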
.gitignore CHANGED
@@ -165,6 +165,7 @@ cython_debug/
 .ragatouille/*
 */__pycache__/*
 .chainlit/translations/
+code/.chainlit/translations/
 storage/logs/*
 vectorstores/*
 
README.md CHANGED
@@ -15,6 +15,8 @@ You can find an implementation of the Tutor at [DL4DS Tutor on Hugging Face](htt
 
 ## Running Locally
 
+Please view `docs/setup.md` for more information on setting up the project.
+
 1. **Clone the Repository**
    ```bash
    git clone https://github.com/DL4DS/dl4ds_tutor
@@ -36,7 +38,6 @@ You can find an implementation of the Tutor at [DL4DS Tutor on Hugging Face](htt
    python -m modules.vectorstore.store_manager
    ```
    - Note: You need to run the above command when you add new data to the `storage/data` directory, or if the `storage/data/urls.txt` file is updated.
-   - Alternatively, you can set `["vectorstore"]["embedd_files"]` to `True` in the `code/modules/config/config.yaml` file, which will embed files from the storage directory every time you run the below chainlit command.
 
 5. **Run the Chainlit App**
    ```bash
@@ -90,4 +91,6 @@ docker run -it --rm -p 8000:8000 dev
 
 ## Contributing
 
-Please create an issue if you have any suggestions or improvements, and start working on it by creating a branch and by making a pull request to the main branch.
+Please create an issue if you have any suggestions or improvements, and start working on it by creating a branch and by making a pull request to the main branch.
+
+Please view `docs/contribute.md` for more information on contributing.
code/.chainlit/config.toml CHANGED
@@ -49,6 +49,8 @@ auto_tag_thread = true
 # Sample rate of the audio
 sample_rate = 44100
 
+edit_message = true
+
 [UI]
 # Name of the assistant.
 name = "AI Tutor"
@@ -115,4 +117,4 @@ custom_meta_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/f/
 #secondary = "#BDBDBD"
 
 [meta]
-generated_by = "1.1.304"
+generated_by = "1.1.306"
code/__init__.py DELETED
@@ -1 +0,0 @@
-from .modules import *
code/main.py CHANGED
@@ -1,7 +1,6 @@
 import chainlit.data as cl_data
 import asyncio
 from modules.config.constants import (
-    LLAMA_PATH,
     LITERAL_API_KEY_LOGGING,
     LITERAL_API_URL,
 )
@@ -9,7 +8,6 @@ from modules.chat_processor.literal_ai import CustomLiteralDataLayer
 
 import json
 import yaml
-import os
 from typing import Any, Dict, no_type_check
 import chainlit as cl
 from modules.chat.llm_tutor import LLMTutor
@@ -73,7 +71,14 @@ class Chatbot:
         start_time = time.time()
 
         llm_settings = cl.user_session.get("llm_settings", {})
-        chat_profile, retriever_method, memory_window, llm_style, generate_follow_up, chunking_mode = (
+        (
+            chat_profile,
+            retriever_method,
+            memory_window,
+            llm_style,
+            generate_follow_up,
+            chunking_mode,
+        ) = (
             llm_settings.get("chat_model"),
             llm_settings.get("retriever_method"),
             llm_settings.get("memory_window"),
@@ -113,8 +118,6 @@ class Chatbot:
             ),
         )
 
-        tags = [chat_profile, self.config["vectorstore"]["db_option"]]
-
         cl.user_session.set("chain", self.chain)
         cl.user_session.set("llm_tutor", self.llm_tutor)
 
@@ -180,7 +183,7 @@ class Chatbot:
             cl.input_widget.Select(
                 id="chunking_mode",
                 label="Chunking mode",
-                values=['fixed', 'semantic'],
+                values=["fixed", "semantic"],
                 initial_index=1,
             ),
             cl.input_widget.Switch(
@@ -241,7 +244,8 @@ class Chatbot:
             )  # see if the thread has any steps
             if thread.steps or len(thread.steps) > 0:
                 return None
-        except:
+        except Exception as e:
+            print(e)
         return [
             cl.Starter(
                 label="recording on CNNs?",
@@ -294,10 +298,18 @@ class Chatbot:
 
         await self.make_llm_settings_widgets(self.config)
         user = cl.user_session.get("user")
-        self.user = {
-            "user_id": user.identifier,
-            "session_id": cl.context.session.thread_id,
-        }
+
+        try:
+            self.user = {
+                "user_id": user.identifier,
+                "session_id": cl.context.session.thread_id,
+            }
+        except Exception as e:
+            print(e)
+            self.user = {
+                "user_id": "guest",
+                "session_id": cl.context.session.thread_id,
+            }
 
         memory = cl.user_session.get("memory", [])
 
@@ -355,7 +367,7 @@ class Chatbot:
         llm_settings = cl.user_session.get("llm_settings", {})
         view_sources = llm_settings.get("view_sources", False)
         stream = llm_settings.get("stream_response", False)
-        steam = False  # Fix streaming
+        stream = False  # Fix streaming
         user_query_dict = {"input": message.content}
         # Define the base configuration
         chain_config = {
code/modules/chat/chat_model_loader.py CHANGED
@@ -1,15 +1,8 @@
 from langchain_openai import ChatOpenAI
-from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
-from transformers import AutoTokenizer, TextStreamer
 from langchain_community.llms import LlamaCpp
-import torch
-import transformers
 import os
 from pathlib import Path
 from huggingface_hub import hf_hub_download
-from langchain.callbacks.manager import CallbackManager
-from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from modules.config.constants import LLAMA_PATH
 
 
 class ChatModelLoader:
@@ -38,7 +31,7 @@ class ChatModelLoader:
                 self.config["llm_params"]["local_llm_params"]["model"]
             )
             llm = LlamaCpp(
-                model_path=LLAMA_PATH,
+                model_path=model_path,
                 n_batch=n_batch,
                 n_ctx=2048,
                 f16_kv=True,
code/modules/chat/langchain/__init__.py ADDED
File without changes
code/modules/chat/langchain/langchain_rag.py CHANGED
@@ -1,17 +1,22 @@
 from langchain_core.prompts import ChatPromptTemplate
 
-from modules.chat.langchain.utils import *
-from langchain.memory import ChatMessageHistory
+# from modules.chat.langchain.utils import
+from langchain_community.chat_message_histories import ChatMessageHistory
 from modules.chat.base import BaseRAG
 from langchain_core.prompts import PromptTemplate
-from langchain.memory import (
-    ConversationBufferWindowMemory,
-    ConversationSummaryBufferMemory,
+from langchain.memory import ConversationBufferWindowMemory
+from langchain_core.runnables.utils import ConfigurableFieldSpec
+from .utils import (
+    CustomConversationalRetrievalChain,
+    create_history_aware_retriever,
+    create_stuff_documents_chain,
+    create_retrieval_chain,
+    return_questions,
+    CustomRunnableWithHistory,
+    BaseChatMessageHistory,
+    InMemoryHistory,
 )
 
-import chainlit as cl
-from langchain_community.chat_models import ChatOpenAI
-
 
 class Langchain_RAG_V1(BaseRAG):
 
code/modules/chat/langchain/utils.py CHANGED
@@ -1,53 +1,29 @@
 from typing import Any, Dict, List, Union, Tuple, Optional
-from langchain_core.messages import (
-    BaseMessage,
-    AIMessage,
-    FunctionMessage,
-    HumanMessage,
-)
-
 from langchain_core.prompts.base import BasePromptTemplate, format_document
-from langchain_core.prompts.chat import MessagesPlaceholder
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.output_parsers.base import BaseOutputParser
 from langchain_core.retrievers import BaseRetriever, RetrieverOutput
 from langchain_core.language_models import LanguageModelLike
 from langchain_core.runnables import Runnable, RunnableBranch, RunnablePassthrough
 from langchain_core.runnables.history import RunnableWithMessageHistory
-from langchain_core.runnables.utils import ConfigurableFieldSpec
 from langchain_core.chat_history import BaseChatMessageHistory
 from langchain_core.pydantic_v1 import BaseModel, Field
 from langchain.chains.combine_documents.base import (
     DEFAULT_DOCUMENT_PROMPT,
     DEFAULT_DOCUMENT_SEPARATOR,
     DOCUMENTS_KEY,
-    BaseCombineDocumentsChain,
     _validate_prompt,
 )
-from langchain.chains.llm import LLMChain
-from langchain_core.callbacks import Callbacks
-from langchain_core.documents import Document
-
-
-CHAT_TURN_TYPE = Union[Tuple[str, str], BaseMessage]
-
 from langchain_core.runnables.config import RunnableConfig
-from langchain_core.messages import BaseMessage
-
-
-from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.chat_models import ChatOpenAI
-
-from langchain.chains import RetrievalQA, ConversationalRetrievalChain
-from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
-
-from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
+from langchain.chains import ConversationalRetrievalChain
 from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
 import inspect
-from langchain.chains.conversational_retrieval.base import _get_chat_history
 from langchain_core.messages import BaseMessage
 
+CHAT_TURN_TYPE = Union[Tuple[str, str], BaseMessage]
+
 
 class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
 
code/modules/chat/llm_tutor.py CHANGED
@@ -3,7 +3,6 @@ from modules.chat.chat_model_loader import ChatModelLoader
 from modules.vectorstore.store_manager import VectorStoreManager
 from modules.retriever.retriever import Retriever
 from modules.chat.langchain.langchain_rag import (
-    Langchain_RAG_V1,
     Langchain_RAG_V2,
     QuestionGenerator,
 )
@@ -28,9 +27,11 @@ class LLMTutor:
         self.rephrase_prompt = get_prompt(
             config, "rephrase"
         )  # Initialize rephrase_prompt
-        if self.config["vectorstore"]["embedd_files"]:
-            self.vector_db.create_database()
-            self.vector_db.save_database()
+
+        # TODO: Removed this functionality for now, don't know if we need it
+        # if self.config["vectorstore"]["embedd_files"]:
+        #     self.vector_db.create_database()
+        #     self.vector_db.save_database()
 
     def update_llm(self, old_config, new_config):
         """
@@ -48,9 +49,11 @@ class LLMTutor:
             self.vector_db = VectorStoreManager(
                 self.config, logger=self.logger
             ).load_database()  # Reinitialize VectorStoreManager if vectorstore changes
-            if self.config["vectorstore"]["embedd_files"]:
-                self.vector_db.create_database()
-                self.vector_db.save_database()
+
+            # TODO: Removed this functionality for now, don't know if we need it
+            # if self.config["vectorstore"]["embedd_files"]:
+            #     self.vector_db.create_database()
+            #     self.vector_db.save_database()
 
         if "llm_params.llm_style" in changes:
             self.qa_prompt = get_prompt(
code/modules/chat_processor/literal_ai.py CHANGED
@@ -1,44 +1,7 @@
-from chainlit.data import ChainlitDataLayer, queue_until_user_message
+from chainlit.data import ChainlitDataLayer
 
 
 # update custom methods here (Ref: https://github.com/Chainlit/chainlit/blob/4b533cd53173bcc24abe4341a7108f0070d60099/backend/chainlit/data/__init__.py)
 class CustomLiteralDataLayer(ChainlitDataLayer):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-
-    @queue_until_user_message()
-    async def create_step(self, step_dict: "StepDict"):
-        metadata = dict(
-            step_dict.get("metadata", {}),
-            **{
-                "waitForAnswer": step_dict.get("waitForAnswer"),
-                "language": step_dict.get("language"),
-                "showInput": step_dict.get("showInput"),
-            },
-        )
-
-        step: LiteralStepDict = {
-            "createdAt": step_dict.get("createdAt"),
-            "startTime": step_dict.get("start"),
-            "endTime": step_dict.get("end"),
-            "generation": step_dict.get("generation"),
-            "id": step_dict.get("id"),
-            "parentId": step_dict.get("parentId"),
-            "name": step_dict.get("name"),
-            "threadId": step_dict.get("threadId"),
-            "type": step_dict.get("type"),
-            "tags": step_dict.get("tags"),
-            "metadata": metadata,
-        }
-        if step_dict.get("input"):
-            step["input"] = {"content": step_dict.get("input")}
-        if step_dict.get("output"):
-            step["output"] = {"content": step_dict.get("output")}
-        if step_dict.get("isError"):
-            step["error"] = step_dict.get("output")
-
-        # print("\n\n\n")
-        # print("Step: ", step)
-        # print("\n\n\n")
-
-        await self.client.api.send_steps([step])
code/modules/config/config.yml CHANGED
@@ -4,7 +4,7 @@ device: 'cpu' # str [cuda, cpu]
 
 vectorstore:
   load_from_HF: True # bool
-  embedd_files: False # bool
+  reparse_files: True # bool
   data_path: '../storage/data' # str
   url_file_path: '../storage/data/urls.txt' # str
   expand_urls: True # bool
@@ -37,7 +37,6 @@ llm_params:
     temperature: 0.7 # float
     repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
     filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
-  pdf_reader: 'pymupdf' # str [llama, pymupdf, gpt]
   stream: False # bool
   pdf_reader: 'gpt' # str [llama, pymupdf, gpt]
 
code/modules/config/constants.py CHANGED
@@ -3,6 +3,8 @@ import os
 
 load_dotenv()
 
+TIMEOUT = 60
+
 # API Keys - Loaded from the .env file
 
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
@@ -14,10 +16,11 @@ LITERAL_API_URL = os.getenv("LITERAL_API_URL")
 OAUTH_GOOGLE_CLIENT_ID = os.getenv("OAUTH_GOOGLE_CLIENT_ID")
 OAUTH_GOOGLE_CLIENT_SECRET = os.getenv("OAUTH_GOOGLE_CLIENT_SECRET")
 
-opening_message = f"Hey, What Can I Help You With?\n\nYou can me ask me questions about the course logistics, course content, about the final project, or anything else!"
+opening_message = "Hey, What Can I Help You With?\n\nYou can me ask me questions about the course logistics, course content, about the final project, or anything else!"
+chat_end_message = (
+    "I hope I was able to help you. If you have any more questions, feel free to ask!"
+)
 
 # Model Paths
 
 LLAMA_PATH = "../storage/models/tinyllama"
-
-RETRIEVER_HF_PATHS = {"RAGatouille": "XThomasBU/Colbert_Index"}
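The new `TIMEOUT` constant is threaded through every outbound `requests` call touched by this commit; a minimal sketch of the pattern (the URL here is illustrative):

```python
# Sketch of the timeout pattern this commit applies to all HTTP calls.
# The URL is illustrative; TIMEOUT is the 60-second constant defined above.
import requests

from modules.config.constants import TIMEOUT

response = requests.get("https://dl4ds.github.io/sp2024/lectures/", timeout=TIMEOUT)
response.raise_for_status()
```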
 
 
code/modules/config/user_config.yml ADDED
@@ -0,0 +1,3 @@
+retriever:
+  retriever_hf_paths:
+    RAGatouille: "XThomasBU/Colbert_Index"
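This user-level config replaces the deleted `RETRIEVER_HF_PATHS` constant; `store_manager.py` (changed below) resolves the Hugging Face index path from it. A sketch of that lookup:

```python
# Sketch: resolve the retriever's Hugging Face path, mirroring store_manager.py below.
import yaml

with open("modules/config/user_config.yml", "r") as f:
    user_config = yaml.safe_load(f)

hf_path = user_config["retriever"]["retriever_hf_paths"]["RAGatouille"]
print(hf_path)  # XThomasBU/Colbert_Index
```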
code/modules/dataloader/data_loader.py CHANGED
@@ -3,40 +3,26 @@ import re
 import requests
 import pysrt
 from langchain_community.document_loaders import (
-    PyMuPDFLoader,
     Docx2txtLoader,
     YoutubeLoader,
-    WebBaseLoader,
     TextLoader,
 )
-from langchain_community.document_loaders import UnstructuredMarkdownLoader
-from llama_parse import LlamaParse
 from langchain.schema import Document
 import logging
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_experimental.text_splitter import SemanticChunker
 from langchain_openai.embeddings import OpenAIEmbeddings
-from ragatouille import RAGPretrainedModel
-from langchain.chains import LLMChain
-from langchain_community.llms import OpenAI
-from langchain import PromptTemplate
 import json
 from concurrent.futures import ThreadPoolExecutor
 from urllib.parse import urljoin
 import html2text
 import bs4
-import tempfile
 import PyPDF2
 from modules.dataloader.pdf_readers.base import PDFReader
 from modules.dataloader.pdf_readers.llama import LlamaParser
 from modules.dataloader.pdf_readers.gpt import GPTParser
-
-try:
-    from modules.dataloader.helpers import get_metadata, download_pdf_from_url
-    from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
-except:
-    from dataloader.helpers import get_metadata, download_pdf_from_url
-    from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
+from modules.dataloader.helpers import get_metadata
+from modules.config.constants import TIMEOUT
 
 logger = logging.getLogger(__name__)
 BASE_DIR = os.getcwd()
@@ -47,7 +33,7 @@ class HTMLReader:
         pass
 
     def read_url(self, url):
-        response = requests.get(url)
+        response = requests.get(url, timeout=TIMEOUT)
         if response.status_code == 200:
             return response.text
         else:
@@ -65,11 +51,13 @@ class HTMLReader:
                     href = href.replace("http", "https")
 
                 absolute_url = urljoin(base_url, href)
-                link['href'] = absolute_url
+                link["href"] = absolute_url
 
-                resp = requests.head(absolute_url)
+                resp = requests.head(absolute_url, timeout=TIMEOUT)
                 if resp.status_code != 200:
-                    logger.warning(f"Link {absolute_url} is broken. Status code: {resp.status_code}")
+                    logger.warning(
+                        f"Link {absolute_url} is broken. Status code: {resp.status_code}"
+                    )
 
         return str(soup)
 
@@ -85,6 +73,7 @@ class HTMLReader:
         else:
             return None
 
+
 class FileReader:
     def __init__(self, logger, kind):
         self.logger = logger
@@ -96,7 +85,9 @@ class FileReader:
         else:
             self.pdf_reader = PDFReader()
         self.web_reader = HTMLReader()
-        self.logger.info(f"Initialized FileReader with {kind} PDF reader and HTML reader")
+        self.logger.info(
+            f"Initialized FileReader with {kind} PDF reader and HTML reader"
+        )
 
     def extract_text_from_pdf(self, pdf_path):
         text = ""
@@ -137,7 +128,7 @@ class FileReader:
         return [Document(page_content=self.web_reader.read_html(url))]
 
     def read_tex_from_url(self, tex_url):
-        response = requests.get(tex_url)
+        response = requests.get(tex_url, timeout=TIMEOUT)
         if response.status_code == 200:
             return [Document(page_content=response.text)]
         else:
@@ -154,17 +145,20 @@ class ChunkProcessor:
         self.document_metadata = {}
         self.document_chunks_full = []
 
-        if not config['vectorstore']['embedd_files']:
+        # TODO: Fix when reparse_files is False
+        if not config["vectorstore"]["reparse_files"]:
             self.load_document_data()
 
         if config["splitter_options"]["use_splitter"]:
             if config["splitter_options"]["chunking_mode"] == "fixed":
                 if config["splitter_options"]["split_by_token"]:
-                    self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-                        chunk_size=config["splitter_options"]["chunk_size"],
-                        chunk_overlap=config["splitter_options"]["chunk_overlap"],
-                        separators=config["splitter_options"]["chunk_separators"],
-                        disallowed_special=(),
+                    self.splitter = (
+                        RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+                            chunk_size=config["splitter_options"]["chunk_size"],
+                            chunk_overlap=config["splitter_options"]["chunk_overlap"],
+                            separators=config["splitter_options"]["chunk_separators"],
+                            disallowed_special=(),
+                        )
                     )
                 else:
                     self.splitter = RecursiveCharacterTextSplitter(
@@ -175,8 +169,7 @@ class ChunkProcessor:
                     )
             else:
                 self.splitter = SemanticChunker(
-                    OpenAIEmbeddings(),
-                    breakpoint_threshold_type="percentile"
+                    OpenAIEmbeddings(), breakpoint_threshold_type="percentile"
                 )
 
         else:
@@ -203,7 +196,10 @@ class ChunkProcessor:
     ):
         # TODO: Clear up this pipeline of re-adding metadata
         documents = [Document(page_content=documents, source=source, page=page)]
-        if file_type == "pdf" and self.config["splitter_options"]["chunking_mode"] == "fixed":
+        if (
+            file_type == "pdf"
+            and self.config["splitter_options"]["chunking_mode"] == "fixed"
+        ):
             document_chunks = documents
         else:
             document_chunks = self.splitter.split_documents(documents)
@@ -229,6 +225,20 @@ class ChunkProcessor:
             "https://dl4ds.github.io/sp2024/lectures/",
             "https://dl4ds.github.io/sp2024/schedule/",
         )  # For any additional metadata
+
+        # remove already processed files if reparse_files is False
+        if not self.config["vectorstore"]["reparse_files"]:
+            total_documents = len(uploaded_files) + len(weblinks)
+            uploaded_files = [
+                file_path
+                for file_path in uploaded_files
+                if file_path not in self.document_data
+            ]
+            weblinks = [link for link in weblinks if link not in self.document_data]
+            print(
+                f"Total documents to process: {total_documents}, Documents already processed: {total_documents - len(uploaded_files) - len(weblinks)}"
+            )
+
         with ThreadPoolExecutor() as executor:
             executor.map(
                 self.process_file,
@@ -298,6 +308,7 @@ class ChunkProcessor:
             self.document_metadata[file_path] = file_metadata
 
     def process_file(self, file_path, file_index, file_reader, addl_metadata):
+        print(f"Processing file {file_index + 1} : {file_path}")
         file_name = os.path.basename(file_path)
 
         file_type = file_name.split(".")[-1]
@@ -314,10 +325,13 @@ class ChunkProcessor:
             return
 
         try:
 
             if file_path in self.document_data:
                 self.logger.warning(f"File {file_name} already processed")
-                documents = [Document(page_content=content) for content in self.document_data[file_path].values()]
+                documents = [
+                    Document(page_content=content)
+                    for content in self.document_data[file_path].values()
+                ]
             else:
                 documents = read_methods[file_type](file_path)
 
@@ -370,22 +384,31 @@ class ChunkProcessor:
             json.dump(self.document_metadata, json_file, indent=4)
 
     def load_document_data(self):
-        with open(
-            f"{self.config['log_chunk_dir']}/docs/doc_content.json", "r"
-        ) as json_file:
-            self.document_data = json.load(json_file)
-        with open(
-            f"{self.config['log_chunk_dir']}/metadata/doc_metadata.json", "r"
-        ) as json_file:
-            self.document_metadata = json.load(json_file)
-        self.logger.info(
-            f"Loaded document content from {self.config['log_chunk_dir']}/docs/doc_content.json. Total documents: {len(self.document_data)}"
-        )
+        try:
+            with open(
+                f"{self.config['log_chunk_dir']}/docs/doc_content.json", "r"
+            ) as json_file:
+                self.document_data = json.load(json_file)
+            with open(
+                f"{self.config['log_chunk_dir']}/metadata/doc_metadata.json", "r"
+            ) as json_file:
+                self.document_metadata = json.load(json_file)
+            self.logger.info(
+                f"Loaded document content from {self.config['log_chunk_dir']}/docs/doc_content.json. Total documents: {len(self.document_data)}"
+            )
+        except FileNotFoundError:
+            self.logger.warning(
+                f"Document content not found in {self.config['log_chunk_dir']}/docs/doc_content.json"
+            )
+            self.document_data = {}
+            self.document_metadata = {}
 
 
 class DataLoader:
     def __init__(self, config, logger=None):
-        self.file_reader = FileReader(logger=logger, kind=config["llm_params"]["pdf_reader"])
+        self.file_reader = FileReader(
+            logger=logger, kind=config["llm_params"]["pdf_reader"]
+        )
         self.chunk_processor = ChunkProcessor(config, logger=logger)
 
     def get_chunks(self, uploaded_files, weblinks):
@@ -403,19 +426,22 @@ if __name__ == "__main__":
     with open("../code/modules/config/config.yml", "r") as f:
         config = yaml.safe_load(f)
 
-    STORAGE_DIR = os.path.join(BASE_DIR, config['vectorstore']["data_path"])
+    STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
     uploaded_files = [
-        os.path.join(STORAGE_DIR, file) for file in os.listdir(STORAGE_DIR) if file != "urls.txt"
+        os.path.join(STORAGE_DIR, file)
+        for file in os.listdir(STORAGE_DIR)
+        if file != "urls.txt"
    ]
 
     data_loader = DataLoader(config, logger=logger)
     document_chunks, document_names, documents, document_metadata = (
         data_loader.get_chunks(
-            ["https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"],
+            [
+                "https://dl4ds.github.io/fa2024/static_files/discussion_slides/00_discussion.pdf"
+            ],
             [],
         )
     )
 
     print(document_names[:5])
     print(len(document_chunks))
-
code/modules/dataloader/helpers.py CHANGED
@@ -2,6 +2,8 @@ import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 import tempfile
+from modules.config.constants import TIMEOUT
+
 
 def get_urls_from_file(file_path: str):
     """
@@ -26,11 +28,11 @@ def get_metadata(lectures_url, schedule_url):
     lecture_metadata = {}
 
     # Get the main lectures page content
-    r_lectures = requests.get(lectures_url)
+    r_lectures = requests.get(lectures_url, timeout=TIMEOUT)
     soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")
 
     # Get the main schedule page content
-    r_schedule = requests.get(schedule_url)
+    r_schedule = requests.get(schedule_url, timeout=TIMEOUT)
     soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")
 
     # Find all lecture blocks
@@ -118,7 +120,7 @@ def download_pdf_from_url(pdf_url):
     Returns:
         str: The local file path of the downloaded PDF file.
     """
-    response = requests.get(pdf_url)
+    response = requests.get(pdf_url, timeout=TIMEOUT)
     if response.status_code == 200:
         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
             temp_file.write(response.content)
code/modules/dataloader/pdf_readers/gpt.py CHANGED
@@ -6,6 +6,7 @@ from io import BytesIO
 from openai import OpenAI
 from pdf2image import convert_from_path
 from langchain.schema import Document
+from modules.config.constants import TIMEOUT
 
 
 class GPTParser:
@@ -19,9 +20,9 @@ class GPTParser:
         self.api_key = os.getenv("OPENAI_API_KEY")
         self.prompt = """
         The provided documents are images of PDFs of lecture slides of deep learning material.
-        They contain LaTeX equations, images, and text.
+        They contain LaTeX equations, images, and text.
         The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
-        The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
+        The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
         For images, give a description and if you can, a source. Separate each page with '---'.
         Just respond with the markdown. Do not include page numbers or any other metadata. Do not try to provide titles. Strictly the content.
         """
@@ -31,36 +32,45 @@ class GPTParser:
 
         encoded_images = [self.encode_image(image) for image in images]
 
-        chunks = [encoded_images[i:i + 5] for i in range(0, len(encoded_images), 5)]
+        chunks = [encoded_images[i : i + 5] for i in range(0, len(encoded_images), 5)]
 
         headers = {
             "Content-Type": "application/json",
-            "Authorization": f"Bearer {self.api_key}"
+            "Authorization": f"Bearer {self.api_key}",
         }
 
         output = ""
         for chunk_num, chunk in enumerate(chunks):
-            content = [{"type": "image_url", "image_url": {
-                "url": f"data:image/jpeg;base64,{image}"}} for image in chunk]
+            content = [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{image}"},
+                }
+                for image in chunk
+            ]
 
             content.insert(0, {"type": "text", "text": self.prompt})
 
             payload = {
                 "model": "gpt-4o-mini",
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": content
-                    }
-                ],
+                "messages": [{"role": "user", "content": content}],
             }
 
             response = requests.post(
-                "https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+                "https://api.openai.com/v1/chat/completions",
+                headers=headers,
+                json=payload,
+                timeout=TIMEOUT,
+            )
 
             resp = response.json()
 
-            chunk_output = resp['choices'][0]['message']['content'].replace("```", "").replace("markdown", "").replace("````", "")
+            chunk_output = (
+                resp["choices"][0]["message"]["content"]
+                .replace("```", "")
+                .replace("markdown", "")
+                .replace("````", "")
+            )
 
             output += chunk_output + "\n---\n"
 
@@ -68,14 +78,12 @@ class GPTParser:
         output = [doc for doc in output if doc.strip() != ""]
 
         documents = [
-            Document(
-                page_content=page,
-                metadata={"source": pdf_path, "page": i}
-            ) for i, page in enumerate(output)
+            Document(page_content=page, metadata={"source": pdf_path, "page": i})
+            for i, page in enumerate(output)
         ]
         return documents
 
     def encode_image(self, image):
         buffered = BytesIO()
         image.save(buffered, format="JPEG")
-        return base64.b64encode(buffered.getvalue()).decode('utf-8')
+        return base64.b64encode(buffered.getvalue()).decode("utf-8")
code/modules/dataloader/pdf_readers/llama.py CHANGED
@@ -2,19 +2,18 @@ import os
 import requests
 from llama_parse import LlamaParse
 from langchain.schema import Document
-from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
+from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY, TIMEOUT
 from modules.dataloader.helpers import download_pdf_from_url
 
 
-
 class LlamaParser:
     def __init__(self):
         self.GPT_API_KEY = OPENAI_API_KEY
         self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
         self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
         self.headers = {
-            'Accept': 'application/json',
-            'Authorization': f'Bearer {LLAMA_CLOUD_API_KEY}'
+            "Accept": "application/json",
+            "Authorization": f"Bearer {LLAMA_CLOUD_API_KEY}",
         }
         self.parser = LlamaParse(
             api_key=LLAMA_CLOUD_API_KEY,
@@ -23,7 +22,7 @@ class LlamaParser:
             language="en",
             gpt4o_mode=False,
             # gpt4o_api_key=OPENAI_API_KEY,
-            parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source."
+            parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source.",
         )
 
     def parse(self, pdf_path):
@@ -38,10 +37,8 @@ class LlamaParser:
         pages = [page.strip() for page in pages]
 
         documents = [
-            Document(
-                page_content=page,
-                metadata={"source": pdf_path, "page": i}
-            ) for i, page in enumerate(pages)
+            Document(page_content=page, metadata={"source": pdf_path, "page": i})
+            for i, page in enumerate(pages)
         ]
 
         return documents
@@ -53,20 +50,30 @@ class LlamaParser:
         }
 
         files = [
-            ('file', ('file', requests.get(pdf_url).content, 'application/octet-stream'))
+            (
+                "file",
+                (
+                    "file",
+                    requests.get(pdf_url, timeout=TIMEOUT).content,
+                    "application/octet-stream",
+                ),
+            )
         ]
 
         response = requests.request(
-            "POST", self.parse_url, headers=self.headers, data=payload, files=files)
+            "POST", self.parse_url, headers=self.headers, data=payload, files=files
+        )
 
-        return response.json()['id'], response.json()['status']
+        return response.json()["id"], response.json()["status"]
 
     async def get_result(self, job_id):
-        url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
+        url = (
+            f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
+        )
 
         response = requests.request("GET", url, headers=self.headers, data={})
 
-        return response.json()['markdown']
+        return response.json()["markdown"]
 
     async def _parse(self, pdf_path):
         job_id, status = self.make_request(pdf_path)
@@ -78,15 +85,9 @@ class LlamaParser:
 
         result = await self.get_result(job_id)
 
-        documents = [
-            Document(
-                page_content=result,
-                metadata={"source": pdf_path}
-            )
-        ]
+        documents = [Document(page_content=result, metadata={"source": pdf_path})]
 
         return documents
 
-    async def _parse(self, pdf_path):
-        return await self._parse(pdf_path)
-
+    # async def _parse(self, pdf_path):
+    #     return await self._parse(pdf_path)
code/modules/dataloader/webpage_crawler.py CHANGED
@@ -3,7 +3,9 @@ from aiohttp import ClientSession
 import asyncio
 import requests
 from bs4 import BeautifulSoup
-from urllib.parse import urlparse, urljoin, urldefrag
+from urllib.parse import urljoin, urldefrag
+from modules.config.constants import TIMEOUT
+
 
 class WebpageCrawler:
     def __init__(self):
@@ -18,7 +20,7 @@ class WebpageCrawler:
 
     def url_exists(self, url: str) -> bool:
         try:
-            response = requests.head(url)
+            response = requests.head(url, timeout=TIMEOUT)
             return response.status_code == 200
         except requests.ConnectionError:
             return False
@@ -88,7 +90,7 @@ class WebpageCrawler:
 
     def is_webpage(self, url: str) -> bool:
         try:
-            response = requests.head(url, allow_redirects=True)
+            response = requests.head(url, allow_redirects=True, timeout=TIMEOUT)
             content_type = response.headers.get("Content-Type", "").lower()
             return "text/html" in content_type
         except requests.RequestException:
code/modules/vectorstore/colbert.py CHANGED
@@ -1,9 +1,9 @@
 from ragatouille import RAGPretrainedModel
 from modules.vectorstore.base import VectorStoreBase
 from langchain_core.retrievers import BaseRetriever
-from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun, Callbacks
+from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
 from langchain_core.documents import Document
-from typing import Any, List, Optional, Sequence
+from typing import Any, List
 import os
 import json
 
@@ -85,6 +85,7 @@ class ColbertVectorStore(VectorStoreBase):
             document_ids=document_names,
             document_metadatas=document_metadata,
         )
+        print(f"Index created at {index_path}")
         self.colbert.set_document_count(len(document_names))
 
     def load_database(self):
code/modules/vectorstore/embedding_model_loader.py CHANGED
@@ -1,9 +1,6 @@
 from langchain_community.embeddings import OpenAIEmbeddings
 from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.embeddings import LlamaCppEmbeddings
-
-from modules.config.constants import *
-import os
+from modules.config.constants import OPENAI_API_KEY, HUGGINGFACE_TOKEN
 
 
 class EmbeddingModelLoader:
@@ -28,8 +25,5 @@ class EmbeddingModelLoader:
                 "trust_remote_code": True,
             },
         )
-        # embedding_model = LlamaCppEmbeddings(
-        #     model_path=os.path.abspath("storage/llama-7b.ggmlv3.q4_0.bin")
-        # )
 
         return embedding_model
code/modules/vectorstore/faiss.py CHANGED
@@ -14,10 +14,15 @@ class FaissVectorStore(VectorStoreBase):
     def __init__(self, config):
         self.config = config
         self._init_vector_db()
-        self.local_path = os.path.join(self.config["vectorstore"]["db_path"],
-                                       "db_" + self.config["vectorstore"]["db_option"]
-                                       + "_" + self.config["vectorstore"]["model"]
-                                       + "_" + config["splitter_options"]["chunking_mode"])
+        self.local_path = os.path.join(
+            self.config["vectorstore"]["db_path"],
+            "db_"
+            + self.config["vectorstore"]["db_option"]
+            + "_"
+            + self.config["vectorstore"]["model"]
+            + "_"
+            + config["splitter_options"]["chunking_mode"],
+        )
 
     def _init_vector_db(self):
         self.faiss = FAISS(
@@ -28,9 +33,7 @@ class FaissVectorStore(VectorStoreBase):
         self.vectorstore = self.faiss.from_documents(
             documents=document_chunks, embedding=embedding_model
         )
-        self.vectorstore.save_local(
-            self.local_path
-        )
+        self.vectorstore.save_local(self.local_path)
 
     def load_database(self, embedding_model):
         self.vectorstore = self.faiss.load_local(
code/modules/vectorstore/raptor.py CHANGED
@@ -317,13 +317,10 @@ class RAPTORVectoreStore(VectorStoreBase):
         print(f"--Generated {len(all_clusters)} clusters--")
 
         # Summarization
-        template = """Here is content from the course DS598: Deep Learning for Data Science.
-
+        template = """Here is content from the course DS598: Deep Learning for Data Science.
         The content may be form webapge about the course, or lecture content, or any other relevant information.
         If the content is in bullet points (from pdf lectre slides), you can summarize the bullet points.
-
         Give a detailed summary of the content below.
-
         Documentation:
         {context}
         """
code/modules/vectorstore/store_manager.py CHANGED
@@ -1,9 +1,7 @@
  from modules.vectorstore.vectorstore import VectorStore
- from modules.vectorstore.helpers import *
+ from modules.dataloader.helpers import get_urls_from_file
  from modules.dataloader.webpage_crawler import WebpageCrawler
  from modules.dataloader.data_loader import DataLoader
- from modules.dataloader.helpers import *
- from modules.config.constants import RETRIEVER_HF_PATHS
  from modules.vectorstore.embedding_model_loader import EmbeddingModelLoader
  import logging
  import os
@@ -117,7 +115,7 @@ class VectorStoreManager:
          )
          num_documents = len(document_chunks)
          self.logger.info(f"Number of documents in the DB: {num_documents}")
-         metadata_keys = list(document_metadata[0].keys())
+         metadata_keys = list(document_metadata[0].keys()) if document_metadata else []
          self.logger.info(f"Metadata keys: {metadata_keys}")
          self.logger.info("Completed loading data")
          self.initialize_database(
@@ -170,13 +168,21 @@ if __name__ == "__main__":
  
      with open("modules/config/config.yml", "r") as f:
          config = yaml.safe_load(f)
+     with open("modules/config/user_config.yml", "r") as f:
+         user_config = yaml.safe_load(f)
      print(config)
+     print(user_config)
      print(f"Trying to create database with config: {config}")
      vector_db = VectorStoreManager(config)
      if config["vectorstore"]["load_from_HF"]:
-         if config["vectorstore"]["db_option"] in RETRIEVER_HF_PATHS:
+         if (
+             config["vectorstore"]["db_option"]
+             in user_config["retriever"]["retriever_hf_paths"]
+         ):
              vector_db.load_from_HF(
-                 HF_PATH=RETRIEVER_HF_PATHS[config["vectorstore"]["db_option"]]
+                 HF_PATH=user_config["retriever"]["retriever_hf_paths"][
+                     config["vectorstore"]["db_option"]
+                 ]
              )
          else:
              # print(f"HF_PATH not available for {config['vectorstore']['db_option']}")
@@ -189,7 +195,7 @@ if __name__ == "__main__":
      vector_db.create_database()
      print("Created database")
  
-     print(f"Trying to load the database")
+     print("Trying to load the database")
      vector_db = VectorStoreManager(config)
      vector_db.load_database()
      print("Loaded database")
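This change replaces the `RETRIEVER_HF_PATHS` constant with a `retriever_hf_paths` mapping read from `modules/config/user_config.yml`. As a rough orientation, a minimal sketch of that file's relevant section might look like the following; the `db_option` keys and Hugging Face paths are illustrative, not the project's actual values:

```yaml
# Hypothetical excerpt from modules/config/user_config.yml.
# Maps each supported db_option to the Hugging Face path that
# VectorStoreManager.load_from_HF() downloads the prebuilt store from.
retriever:
  retriever_hf_paths:
    FAISS: "your-org/tutor-vectorstore-faiss"          # illustrative path
    RAGatouille: "your-org/tutor-vectorstore-colbert"  # illustrative path
```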
docs/README.md DELETED
@@ -1,51 +0,0 @@
- # Documentation
- 
- ## File Structure:
- - `docs/` - Documentation files
- - `code/` - Code files
- - `storage/` - Storage files
- - `vectorstores/` - Vector Databases
- - `.env` - Environment Variables
- - `Dockerfile` - Dockerfile for Hugging Face
- - `.chainlit` - Chainlit Configuration
- - `chainlit.md` - Chainlit README
- - `README.md` - Repository README
- - `.gitignore` - Gitignore file
- - `requirements.txt` - Python Requirements
- - `.gitattributes` - Gitattributes file
- 
- ## Code Structure
- 
- `code/main.py` - Main Chainlit App
- `code/config.yaml` - Configuration File to set Embedding related, Vector Database related, and Chat Model related parameters.
- `code/modules/vector_db.py` - Vector Database Creation
- `code/modules/chat_model_loader.py` - Chat Model Loader (Creates the Chat Model)
- `code/modules/constants.py` - Constants (Loads the Environment Variables, Prompts, Model Paths, etc.)
- `code/modules/data_loader.py` - Loads and Chunks the Data
- `code/modules/embedding_model.py` - Creates the Embedding Model to Embed the Data
- `code/modules/llm_tutor.py` - Creates the RAG LLM Tutor
- The Function `qa_bot()` loads the vector database and the chat model, and sets the prompt to pass to the chat model.
- `code/modules/helpers.py` - Helper Functions
- 
- ## Storage and Vectorstores
- 
- - `storage/data/` - Data Storage (Put your pdf files under this directory, and urls in the urls.txt file)
- - `storage/models/` - Model Storage (Put your local LLMs under this directory)
- 
- - `vectorstores/` - Vector Databases (Stores the Vector Databases generated from `code/modules/vector_db.py`)
- 
- 
- ## Useful Configurations
- set these in `code/config.yaml`:
- * ``["embedding_options"]["embedd_files"]`` - If set to True, embeds the files from the storage directory everytime you run the chainlit command. If set to False, uses the stored vector database.
- * ``["embedding_options"]["expand_urls"]`` - If set to True, gets and reads the data from all the links under the url provided. If set to False, only reads the data in the url provided.
- * ``["embedding_options"]["search_top_k"]`` - Number of sources that the retriever returns
- * ``["llm_params]["use_history"]`` - Whether to use history in the prompt or not
- * ``["llm_params]["memory_window"]`` - Number of interactions to keep a track of in the history
- 
- 
- ## LlamaCpp
- * https://python.langchain.com/docs/integrations/llms/llamacpp
- 
- ## Hugging Face Models
- * Download the ``.gguf`` files for your Local LLM from Hugging Face (Example: https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF)
docs/contribute.md ADDED
@@ -0,0 +1,33 @@
+ 💡 **Please ensure formatting, linting, and security checks pass before submitting a pull request.**
+ 
+ ## Code Formatting
+ 
+ The codebase is formatted using [black](https://github.com/psf/black).
+ 
+ To format the codebase, run the following command:
+ 
+ ```bash
+ black .
+ ```
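If you only want to verify formatting without modifying any files, black also supports a check mode, and `--diff` prints the edits it would apply:

```bash
# report files that would be reformatted, without changing them
black --check .
# also show the exact changes black would make
black --diff .
```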
+ 
+ Please ensure that the code is formatted before submitting a pull request.
+ 
+ ## Linting
+ 
+ The codebase is linted using [flake8](https://flake8.pycqa.org/en/latest/).
+ 
+ To view the linting errors, run the following command:
+ 
+ ```bash
+ flake8 .
+ ```
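While iterating on a single module, it can be faster to lint just that path; for example, using a file that appears in this repository:

```bash
# lint one package
flake8 code/modules/vectorstore/
# lint a single file
flake8 code/modules/vectorstore/store_manager.py
```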
+ 
+ ## Security and Vulnerabilities
+ 
+ The codebase is scanned for security vulnerabilities using [bandit](https://github.com/PyCQA/bandit).
+ 
+ To scan the codebase for security vulnerabilities, run the following command:
+ 
+ ```bash
+ bandit -r .
+ ```
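On a large tree, bandit output can be noisy. Its standard flags let you filter by severity or exclude paths; the excluded directories below are illustrative, not mandated by the project:

```bash
# report only high-severity findings
bandit -r . -lll
# skip directories that hold generated artifacts (paths illustrative)
bandit -r . -x ./vectorstores,./storage
```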
docs/setup.md ADDED
@@ -0,0 +1,127 @@
+ # Initial Setup
+ 
+ ⚠️ **Create the .env file inside the `code/` directory.**
+ 
+ ## Python Environment
+ 
+ Python Version: 3.11
+ 
+ Create a virtual environment and install the required packages:
+ 
+ ```bash
+ conda create -n ai_tutor python=3.11
+ conda activate ai_tutor
+ pip install -r requirements.txt
+ ```
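Conda is not strictly required; a plain virtual environment should work as well, as long as it uses Python 3.11. A sketch, assuming `python3.11` is on your PATH:

```bash
python3.11 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
```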
+ 
+ ## Code Formatting
+ 
+ The codebase is formatted using [black](https://github.com/psf/black). If you make changes to the codebase, ensure that the code is formatted before submitting a pull request. More instructions can be found in `docs/contribute.md`.
+ 
+ ## Google OAuth 2.0 Client ID and Secret
+ 
+ To set up the Google OAuth 2.0 Client ID and Secret, follow these steps:
+ 
+ 1. Go to the [Google Cloud Console](https://console.cloud.google.com/apis/credentials).
+ 2. Create a new project or select an existing one.
+ 3. Navigate to the "Credentials" page.
+ 4. Click on "Create Credentials" and select "OAuth 2.0 Client ID".
+ 5. Configure the OAuth consent screen if you haven't already.
+ 6. Choose "Web application" as the application type.
+ 7. Configure the redirect URIs as needed.
+ 8. Copy the generated `Client ID` and `Client Secret`.
+ 
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
+ 
+ ```bash
+ OAUTH_GOOGLE_CLIENT_ID=<your_client_id>
+ OAUTH_GOOGLE_CLIENT_SECRET=<your_client_secret>
+ ```
+ 
+ ## Literal AI API Key
+ 
+ To obtain the Literal AI API key:
+ 
+ 1. Sign up or log in to [Literal AI](https://cloud.getliteral.ai/).
+ 2. Navigate to the API Keys section under your account settings.
+ 3. Create a new API key if necessary and copy it.
+ 
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
+ 
+ ```bash
+ LITERAL_API_KEY_LOGGING=<your_api_key>
+ LITERAL_API_URL=https://cloud.getliteral.ai
+ ```
+ 
+ ## LlamaCloud API Key
+ 
+ To obtain the LlamaCloud API Key:
+ 
+ 1. Go to [LlamaCloud](https://cloud.llamaindex.ai/).
+ 2. Sign up or log in to your account.
+ 3. Navigate to the API section and generate a new API key if necessary.
+ 
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
+ 
+ ```bash
+ LLAMA_CLOUD_API_KEY=<your_api_key>
+ ```
+ 
+ ## Hugging Face Access Token
+ 
+ To obtain your Hugging Face access token:
+ 
+ 1. Go to [Hugging Face settings](https://huggingface.co/settings/tokens).
+ 2. Log in or create an account.
+ 3. Generate a new token or use an existing one.
+ 
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
+ 
+ ```bash
+ HUGGINGFACE_TOKEN=<your-huggingface-token>
+ ```
+ 
+ ## Chainlit Authentication Secret
+ 
+ You must provide a JWT secret in the environment to use authentication. Run `chainlit create-secret` to generate one.
+ 
+ ```bash
+ chainlit create-secret
+ ```
+ 
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
+ 
+ ```bash
+ CHAINLIT_AUTH_SECRET=<your_jwt_secret>
+ CHAINLIT_URL=<your_chainlit_url> # Example: CHAINLIT_URL=http://localhost:8000
+ ```
+ 
+ ## OpenAI API Key
+ 
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
+ 
+ ```bash
+ OPENAI_API_KEY=<your_openai_api_key>
+ ```
+ 
+ ## In a Nutshell
+ 
+ Your .env file (secrets on Hugging Face Spaces) should look like this:
+ 
+ ```bash
+ CHAINLIT_AUTH_SECRET=<your_jwt_secret>
+ OPENAI_API_KEY=<your_openai_api_key>
+ HUGGINGFACE_TOKEN=<your-huggingface-token>
+ LITERAL_API_KEY_LOGGING=<your_api_key>
+ LITERAL_API_URL=https://cloud.getliteral.ai
+ OAUTH_GOOGLE_CLIENT_ID=<your_client_id>
+ OAUTH_GOOGLE_CLIENT_SECRET=<your_client_secret>
+ LLAMA_CLOUD_API_KEY=<your_api_key>
+ CHAINLIT_URL=<your_chainlit_url>
+ ```
+ 
+ # Configuration
+ 
+ The configuration file `code/modules/config/config.yml` contains the parameters that control the behaviour of your app.
+ The configuration file `code/modules/config/user_config.yml` contains user-defined parameters.
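For orientation, the `vectorstore` section of `config.yml` is what drives the `store_manager` flow shown in the diff above. A minimal sketch with illustrative values only:

```yaml
# Hypothetical excerpt from code/modules/config/config.yml.
vectorstore:
  load_from_HF: True   # download a prebuilt vectorstore from Hugging Face
  db_option: "FAISS"   # illustrative; must match a key in user_config's retriever_hf_paths
```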
pyproject.toml ADDED
@@ -0,0 +1,2 @@
+ [tool.black]
+ line-length = 88
requirements.txt CHANGED
@@ -27,3 +27,6 @@ langchain_experimental
  html2text
  PyPDF2
  pdf2image
+ black
+ flake8
+ bandit
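Since black, flake8, and bandit are now listed in `requirements.txt`, a regular `pip install -r requirements.txt` pulls in the dev tools too; to add just them to an existing environment:

```bash
pip install black flake8 bandit
```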