Thomas (Tom) Gardos committed on
Commit 36e2567
Parents: 8b05bb6, b2b3fe2

Merge pull request #58 from DL4DS/setup_and_format_instructions

.flake8 ADDED
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length = 88
+extend-ignore = E203, E266, E501, W503
.github/workflows/code_quality_check.yml ADDED
@@ -0,0 +1,33 @@
+name: Code Quality and Security Checks
+
+on:
+  push:
+    branches: [ main, dev_branch ]
+  pull_request:
+    branches: [ main, dev_branch ]
+
+jobs:
+  code-quality:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install flake8 black bandit
+
+      - name: Run Black
+        run: black --check .
+
+      - name: Run Flake8
+        run: flake8 .
+
+      - name: Run Bandit
+        run: |
+          bandit -r .
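The same three checks can be reproduced locally before pushing; a minimal sketch, assuming the tools are installed with `pip install flake8 black bandit` as in the workflow above:

```python
# Sketch: run the CI checks locally; assumes flake8, black, and bandit are pip-installed.
import subprocess

for cmd in (["black", "--check", "."], ["flake8", "."], ["bandit", "-r", "."]):
    print("Running:", " ".join(cmd))
    subprocess.run(cmd, check=True)  # check=True raises CalledProcessError on any failure
```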
.gitignore CHANGED
@@ -165,6 +165,7 @@ cython_debug/
 .ragatouille/*
 */__pycache__/*
 .chainlit/translations/
+code/.chainlit/translations/
 storage/logs/*
 vectorstores/*
 
README.md CHANGED
@@ -15,6 +15,8 @@ You can find an implementation of the Tutor at [DL4DS Tutor on Hugging Face](htt
 
 ## Running Locally
 
+Please view `docs/setup.md` for more information on setting up the project.
+
 1. **Clone the Repository**
    ```bash
    git clone https://github.com/DL4DS/dl4ds_tutor
@@ -36,7 +38,6 @@ You can find an implementation of the Tutor at [DL4DS Tutor on Hugging Face](htt
    python -m modules.vectorstore.store_manager
    ```
    - Note: You need to run the above command when you add new data to the `storage/data` directory, or if the `storage/data/urls.txt` file is updated.
-   - Alternatively, you can set `["vectorstore"]["embedd_files"]` to `True` in the `code/modules/config/config.yaml` file, which will embed files from the storage directory every time you run the below chainlit command.
 
 5. **Run the Chainlit App**
    ```bash
@@ -90,4 +91,6 @@ docker run -it --rm -p 8000:8000 dev
 
 ## Contributing
 
-Please create an issue if you have any suggestions or improvements, and start working on it by creating a branch and by making a pull request to the main branch.
+Please create an issue if you have any suggestions or improvements, and start working on it by creating a branch and by making a pull request to the main branch.
+
+Please view `docs/contribute.md` for more information on contributing.
code/.chainlit/config.toml CHANGED
@@ -49,6 +49,8 @@ auto_tag_thread = true
 # Sample rate of the audio
 sample_rate = 44100
 
+edit_message = true
+
 [UI]
 # Name of the assistant.
 name = "AI Tutor"
@@ -115,4 +117,4 @@ custom_meta_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/f/
 #secondary = "#BDBDBD"
 
 [meta]
-generated_by = "1.1.304"
+generated_by = "1.1.306"
code/__init__.py DELETED
@@ -1 +0,0 @@
-from .modules import *
code/main.py CHANGED
@@ -1,7 +1,6 @@
 import chainlit.data as cl_data
 import asyncio
 from modules.config.constants import (
-    LLAMA_PATH,
     LITERAL_API_KEY_LOGGING,
     LITERAL_API_URL,
 )
@@ -9,7 +8,6 @@ from modules.chat_processor.literal_ai import CustomLiteralDataLayer
 
 import json
 import yaml
-import os
 from typing import Any, Dict, no_type_check
 import chainlit as cl
 from modules.chat.llm_tutor import LLMTutor
@@ -73,7 +71,14 @@ class Chatbot:
         start_time = time.time()
 
         llm_settings = cl.user_session.get("llm_settings", {})
-        chat_profile, retriever_method, memory_window, llm_style, generate_follow_up, chunking_mode = (
+        (
+            chat_profile,
+            retriever_method,
+            memory_window,
+            llm_style,
+            generate_follow_up,
+            chunking_mode,
+        ) = (
             llm_settings.get("chat_model"),
             llm_settings.get("retriever_method"),
             llm_settings.get("memory_window"),
@@ -113,8 +118,6 @@ class Chatbot:
             ),
         )
 
-        tags = [chat_profile, self.config["vectorstore"]["db_option"]]
-
         cl.user_session.set("chain", self.chain)
         cl.user_session.set("llm_tutor", self.llm_tutor)
 
@@ -180,7 +183,7 @@ class Chatbot:
             cl.input_widget.Select(
                 id="chunking_mode",
                 label="Chunking mode",
-                values=['fixed', 'semantic'],
+                values=["fixed", "semantic"],
                 initial_index=1,
             ),
             cl.input_widget.Switch(
@@ -241,7 +244,8 @@ class Chatbot:
             )  # see if the thread has any steps
             if thread.steps or len(thread.steps) > 0:
                 return None
-        except:
+        except Exception as e:
+            print(e)
         return [
             cl.Starter(
                 label="recording on CNNs?",
@@ -294,10 +298,18 @@ class Chatbot:
 
         await self.make_llm_settings_widgets(self.config)
         user = cl.user_session.get("user")
-        self.user = {
-            "user_id": user.identifier,
-            "session_id": cl.context.session.thread_id,
-        }
+
+        try:
+            self.user = {
+                "user_id": user.identifier,
+                "session_id": cl.context.session.thread_id,
+            }
+        except Exception as e:
+            print(e)
+            self.user = {
+                "user_id": "guest",
+                "session_id": cl.context.session.thread_id,
+            }
 
         memory = cl.user_session.get("memory", [])
 
@@ -355,7 +367,7 @@ class Chatbot:
         llm_settings = cl.user_session.get("llm_settings", {})
         view_sources = llm_settings.get("view_sources", False)
         stream = llm_settings.get("stream_response", False)
-        steam = False  # Fix streaming
+        stream = False  # Fix streaming
         user_query_dict = {"input": message.content}
         # Define the base configuration
         chain_config = {
code/modules/chat/chat_model_loader.py CHANGED
@@ -1,15 +1,8 @@
 from langchain_openai import ChatOpenAI
-from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
-from transformers import AutoTokenizer, TextStreamer
 from langchain_community.llms import LlamaCpp
-import torch
-import transformers
 import os
 from pathlib import Path
 from huggingface_hub import hf_hub_download
-from langchain.callbacks.manager import CallbackManager
-from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from modules.config.constants import LLAMA_PATH
 
 
 class ChatModelLoader:
@@ -38,7 +31,7 @@ class ChatModelLoader:
                 self.config["llm_params"]["local_llm_params"]["model"]
             )
             llm = LlamaCpp(
-                model_path=LLAMA_PATH,
+                model_path=model_path,
                 n_batch=n_batch,
                 n_ctx=2048,
                 f16_kv=True,
code/modules/chat/langchain/__init__.py ADDED
File without changes
code/modules/chat/langchain/langchain_rag.py CHANGED
@@ -1,17 +1,22 @@
 from langchain_core.prompts import ChatPromptTemplate
 
-from modules.chat.langchain.utils import *
-from langchain.memory import ChatMessageHistory
+# from modules.chat.langchain.utils import
+from langchain_community.chat_message_histories import ChatMessageHistory
 from modules.chat.base import BaseRAG
 from langchain_core.prompts import PromptTemplate
-from langchain.memory import (
-    ConversationBufferWindowMemory,
-    ConversationSummaryBufferMemory,
+from langchain.memory import ConversationBufferWindowMemory
+from langchain_core.runnables.utils import ConfigurableFieldSpec
+from .utils import (
+    CustomConversationalRetrievalChain,
+    create_history_aware_retriever,
+    create_stuff_documents_chain,
+    create_retrieval_chain,
+    return_questions,
+    CustomRunnableWithHistory,
+    BaseChatMessageHistory,
+    InMemoryHistory,
 )
 
-import chainlit as cl
-from langchain_community.chat_models import ChatOpenAI
-
 
 class Langchain_RAG_V1(BaseRAG):
 
code/modules/chat/langchain/utils.py CHANGED
@@ -1,53 +1,29 @@
 from typing import Any, Dict, List, Union, Tuple, Optional
-from langchain_core.messages import (
-    BaseMessage,
-    AIMessage,
-    FunctionMessage,
-    HumanMessage,
-)
-
 from langchain_core.prompts.base import BasePromptTemplate, format_document
-from langchain_core.prompts.chat import MessagesPlaceholder
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.output_parsers.base import BaseOutputParser
 from langchain_core.retrievers import BaseRetriever, RetrieverOutput
 from langchain_core.language_models import LanguageModelLike
 from langchain_core.runnables import Runnable, RunnableBranch, RunnablePassthrough
 from langchain_core.runnables.history import RunnableWithMessageHistory
-from langchain_core.runnables.utils import ConfigurableFieldSpec
 from langchain_core.chat_history import BaseChatMessageHistory
 from langchain_core.pydantic_v1 import BaseModel, Field
 from langchain.chains.combine_documents.base import (
     DEFAULT_DOCUMENT_PROMPT,
     DEFAULT_DOCUMENT_SEPARATOR,
     DOCUMENTS_KEY,
-    BaseCombineDocumentsChain,
     _validate_prompt,
 )
-from langchain.chains.llm import LLMChain
-from langchain_core.callbacks import Callbacks
-from langchain_core.documents import Document
-
-
-CHAT_TURN_TYPE = Union[Tuple[str, str], BaseMessage]
-
 from langchain_core.runnables.config import RunnableConfig
-from langchain_core.messages import BaseMessage
-
-
-from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.chat_models import ChatOpenAI
-
-from langchain.chains import RetrievalQA, ConversationalRetrievalChain
-from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
-
-from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
+from langchain.chains import ConversationalRetrievalChain
 from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
 import inspect
-from langchain.chains.conversational_retrieval.base import _get_chat_history
 from langchain_core.messages import BaseMessage
 
+CHAT_TURN_TYPE = Union[Tuple[str, str], BaseMessage]
+
 
 class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
 
code/modules/chat/llm_tutor.py CHANGED
@@ -3,7 +3,6 @@ from modules.chat.chat_model_loader import ChatModelLoader
 from modules.vectorstore.store_manager import VectorStoreManager
 from modules.retriever.retriever import Retriever
 from modules.chat.langchain.langchain_rag import (
-    Langchain_RAG_V1,
     Langchain_RAG_V2,
     QuestionGenerator,
 )
@@ -28,9 +27,11 @@ class LLMTutor:
         self.rephrase_prompt = get_prompt(
             config, "rephrase"
         )  # Initialize rephrase_prompt
-        if self.config["vectorstore"]["embedd_files"]:
-            self.vector_db.create_database()
-            self.vector_db.save_database()
+
+        # TODO: Removed this functionality for now, don't know if we need it
+        # if self.config["vectorstore"]["embedd_files"]:
+        #     self.vector_db.create_database()
+        #     self.vector_db.save_database()
 
     def update_llm(self, old_config, new_config):
         """
@@ -48,9 +49,11 @@ class LLMTutor:
             self.vector_db = VectorStoreManager(
                 self.config, logger=self.logger
             ).load_database()  # Reinitialize VectorStoreManager if vectorstore changes
-            if self.config["vectorstore"]["embedd_files"]:
-                self.vector_db.create_database()
-                self.vector_db.save_database()
+
+            # TODO: Removed this functionality for now, don't know if we need it
+            # if self.config["vectorstore"]["embedd_files"]:
+            #     self.vector_db.create_database()
+            #     self.vector_db.save_database()
 
         if "llm_params.llm_style" in changes:
             self.qa_prompt = get_prompt(
code/modules/chat_processor/literal_ai.py CHANGED
@@ -1,44 +1,7 @@
-from chainlit.data import ChainlitDataLayer, queue_until_user_message
+from chainlit.data import ChainlitDataLayer
 
 
 # update custom methods here (Ref: https://github.com/Chainlit/chainlit/blob/4b533cd53173bcc24abe4341a7108f0070d60099/backend/chainlit/data/__init__.py)
 class CustomLiteralDataLayer(ChainlitDataLayer):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-
-    @queue_until_user_message()
-    async def create_step(self, step_dict: "StepDict"):
-        metadata = dict(
-            step_dict.get("metadata", {}),
-            **{
-                "waitForAnswer": step_dict.get("waitForAnswer"),
-                "language": step_dict.get("language"),
-                "showInput": step_dict.get("showInput"),
-            },
-        )
-
-        step: LiteralStepDict = {
-            "createdAt": step_dict.get("createdAt"),
-            "startTime": step_dict.get("start"),
-            "endTime": step_dict.get("end"),
-            "generation": step_dict.get("generation"),
-            "id": step_dict.get("id"),
-            "parentId": step_dict.get("parentId"),
-            "name": step_dict.get("name"),
-            "threadId": step_dict.get("threadId"),
-            "type": step_dict.get("type"),
-            "tags": step_dict.get("tags"),
-            "metadata": metadata,
-        }
-        if step_dict.get("input"):
-            step["input"] = {"content": step_dict.get("input")}
-        if step_dict.get("output"):
-            step["output"] = {"content": step_dict.get("output")}
-        if step_dict.get("isError"):
-            step["error"] = step_dict.get("output")
-
-        # print("\n\n\n")
-        # print("Step: ", step)
-        # print("\n\n\n")
-
-        await self.client.api.send_steps([step])
code/modules/config/config.yml CHANGED
@@ -4,7 +4,7 @@ device: 'cpu' # str [cuda, cpu]
 
 vectorstore:
   load_from_HF: True # bool
-  embedd_files: False # bool
+  reparse_files: True # bool
   data_path: '../storage/data' # str
   url_file_path: '../storage/data/urls.txt' # str
   expand_urls: True # bool
@@ -37,7 +37,6 @@ llm_params:
     temperature: 0.7 # float
     repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
     filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
-  pdf_reader: 'pymupdf' # str [llama, pymupdf, gpt]
   stream: False # bool
   pdf_reader: 'gpt' # str [llama, pymupdf, gpt]
 
code/modules/config/constants.py CHANGED
@@ -3,6 +3,8 @@ import os
 
 load_dotenv()
 
+TIMEOUT = 60
+
 # API Keys - Loaded from the .env file
 
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
@@ -14,10 +16,11 @@ LITERAL_API_URL = os.getenv("LITERAL_API_URL")
 OAUTH_GOOGLE_CLIENT_ID = os.getenv("OAUTH_GOOGLE_CLIENT_ID")
 OAUTH_GOOGLE_CLIENT_SECRET = os.getenv("OAUTH_GOOGLE_CLIENT_SECRET")
 
-opening_message = f"Hey, What Can I Help You With?\n\nYou can me ask me questions about the course logistics, course content, about the final project, or anything else!"
+opening_message = "Hey, What Can I Help You With?\n\nYou can me ask me questions about the course logistics, course content, about the final project, or anything else!"
+chat_end_message = (
+    "I hope I was able to help you. If you have any more questions, feel free to ask!"
+)
 
 # Model Paths
 
 LLAMA_PATH = "../storage/models/tinyllama"
-
-RETRIEVER_HF_PATHS = {"RAGatouille": "XThomasBU/Colbert_Index"}
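The new `TIMEOUT` constant is threaded through every outbound `requests` call touched by this commit; a minimal sketch of the pattern (the URL here is illustrative):

```python
# Sketch of the timeout pattern this commit applies to all HTTP calls.
# The URL is illustrative; TIMEOUT is the 60-second constant defined above.
import requests

from modules.config.constants import TIMEOUT

response = requests.get("https://dl4ds.github.io/sp2024/lectures/", timeout=TIMEOUT)
response.raise_for_status()
```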
 
 
code/modules/config/user_config.yml ADDED
@@ -0,0 +1,3 @@
+retriever:
+  retriever_hf_paths:
+    RAGatouille: "XThomasBU/Colbert_Index"
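This user-level config replaces the deleted `RETRIEVER_HF_PATHS` constant; `store_manager.py` (changed below) resolves the Hugging Face index path from it. A sketch of that lookup:

```python
# Sketch: resolve the retriever's Hugging Face path, mirroring store_manager.py below.
import yaml

with open("modules/config/user_config.yml", "r") as f:
    user_config = yaml.safe_load(f)

hf_path = user_config["retriever"]["retriever_hf_paths"]["RAGatouille"]
print(hf_path)  # XThomasBU/Colbert_Index
```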
code/modules/dataloader/data_loader.py CHANGED
@@ -3,40 +3,26 @@ import re
 import requests
 import pysrt
 from langchain_community.document_loaders import (
-    PyMuPDFLoader,
     Docx2txtLoader,
     YoutubeLoader,
-    WebBaseLoader,
     TextLoader,
 )
-from langchain_community.document_loaders import UnstructuredMarkdownLoader
-from llama_parse import LlamaParse
 from langchain.schema import Document
 import logging
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_experimental.text_splitter import SemanticChunker
 from langchain_openai.embeddings import OpenAIEmbeddings
-from ragatouille import RAGPretrainedModel
-from langchain.chains import LLMChain
-from langchain_community.llms import OpenAI
-from langchain import PromptTemplate
 import json
 from concurrent.futures import ThreadPoolExecutor
 from urllib.parse import urljoin
 import html2text
 import bs4
-import tempfile
 import PyPDF2
 from modules.dataloader.pdf_readers.base import PDFReader
 from modules.dataloader.pdf_readers.llama import LlamaParser
 from modules.dataloader.pdf_readers.gpt import GPTParser
-
-try:
-    from modules.dataloader.helpers import get_metadata, download_pdf_from_url
-    from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
-except:
-    from dataloader.helpers import get_metadata, download_pdf_from_url
-    from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
+from modules.dataloader.helpers import get_metadata
+from modules.config.constants import TIMEOUT
 
 logger = logging.getLogger(__name__)
 BASE_DIR = os.getcwd()
@@ -47,7 +33,7 @@ class HTMLReader:
         pass
 
     def read_url(self, url):
-        response = requests.get(url)
+        response = requests.get(url, timeout=TIMEOUT)
         if response.status_code == 200:
             return response.text
         else:
@@ -65,11 +51,13 @@ class HTMLReader:
                     href = href.replace("http", "https")
 
                 absolute_url = urljoin(base_url, href)
-                link['href'] = absolute_url
+                link["href"] = absolute_url
 
-                resp = requests.head(absolute_url)
+                resp = requests.head(absolute_url, timeout=TIMEOUT)
                 if resp.status_code != 200:
-                    logger.warning(f"Link {absolute_url} is broken. Status code: {resp.status_code}")
+                    logger.warning(
+                        f"Link {absolute_url} is broken. Status code: {resp.status_code}"
+                    )
 
         return str(soup)
 
@@ -85,6 +73,7 @@ class HTMLReader:
         else:
             return None
 
+
 class FileReader:
     def __init__(self, logger, kind):
         self.logger = logger
@@ -96,7 +85,9 @@ class FileReader:
         else:
             self.pdf_reader = PDFReader()
         self.web_reader = HTMLReader()
-        self.logger.info(f"Initialized FileReader with {kind} PDF reader and HTML reader")
+        self.logger.info(
+            f"Initialized FileReader with {kind} PDF reader and HTML reader"
+        )
 
     def extract_text_from_pdf(self, pdf_path):
         text = ""
@@ -137,7 +128,7 @@ class FileReader:
         return [Document(page_content=self.web_reader.read_html(url))]
 
     def read_tex_from_url(self, tex_url):
-        response = requests.get(tex_url)
+        response = requests.get(tex_url, timeout=TIMEOUT)
         if response.status_code == 200:
             return [Document(page_content=response.text)]
         else:
@@ -154,17 +145,20 @@ class ChunkProcessor:
         self.document_metadata = {}
         self.document_chunks_full = []
 
-        if not config['vectorstore']['embedd_files']:
+        # TODO: Fix when reparse_files is False
+        if not config["vectorstore"]["reparse_files"]:
             self.load_document_data()
 
         if config["splitter_options"]["use_splitter"]:
             if config["splitter_options"]["chunking_mode"] == "fixed":
                 if config["splitter_options"]["split_by_token"]:
-                    self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-                        chunk_size=config["splitter_options"]["chunk_size"],
-                        chunk_overlap=config["splitter_options"]["chunk_overlap"],
-                        separators=config["splitter_options"]["chunk_separators"],
-                        disallowed_special=(),
+                    self.splitter = (
+                        RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+                            chunk_size=config["splitter_options"]["chunk_size"],
+                            chunk_overlap=config["splitter_options"]["chunk_overlap"],
+                            separators=config["splitter_options"]["chunk_separators"],
+                            disallowed_special=(),
+                        )
                     )
                 else:
                     self.splitter = RecursiveCharacterTextSplitter(
@@ -175,8 +169,7 @@ class ChunkProcessor:
                     )
             else:
                 self.splitter = SemanticChunker(
-                    OpenAIEmbeddings(),
-                    breakpoint_threshold_type="percentile"
+                    OpenAIEmbeddings(), breakpoint_threshold_type="percentile"
                 )
 
         else:
@@ -203,7 +196,10 @@ class ChunkProcessor:
     ):
         # TODO: Clear up this pipeline of re-adding metadata
         documents = [Document(page_content=documents, source=source, page=page)]
-        if file_type == "pdf" and self.config["splitter_options"]["chunking_mode"] == "fixed":
+        if (
+            file_type == "pdf"
+            and self.config["splitter_options"]["chunking_mode"] == "fixed"
+        ):
             document_chunks = documents
         else:
             document_chunks = self.splitter.split_documents(documents)
@@ -229,6 +225,20 @@ class ChunkProcessor:
             "https://dl4ds.github.io/sp2024/lectures/",
             "https://dl4ds.github.io/sp2024/schedule/",
         )  # For any additional metadata
+
+        # remove already processed files if reparse_files is False
+        if not self.config["vectorstore"]["reparse_files"]:
+            total_documents = len(uploaded_files) + len(weblinks)
+            uploaded_files = [
+                file_path
+                for file_path in uploaded_files
+                if file_path not in self.document_data
+            ]
+            weblinks = [link for link in weblinks if link not in self.document_data]
+            print(
+                f"Total documents to process: {total_documents}, Documents already processed: {total_documents - len(uploaded_files) - len(weblinks)}"
+            )
+
         with ThreadPoolExecutor() as executor:
             executor.map(
                 self.process_file,
@@ -298,6 +308,7 @@ class ChunkProcessor:
             self.document_metadata[file_path] = file_metadata
 
     def process_file(self, file_path, file_index, file_reader, addl_metadata):
+        print(f"Processing file {file_index + 1} : {file_path}")
         file_name = os.path.basename(file_path)
 
         file_type = file_name.split(".")[-1]
@@ -314,10 +325,13 @@ class ChunkProcessor:
             return
 
         try:
 
             if file_path in self.document_data:
                 self.logger.warning(f"File {file_name} already processed")
-                documents = [Document(page_content=content) for content in self.document_data[file_path].values()]
+                documents = [
+                    Document(page_content=content)
+                    for content in self.document_data[file_path].values()
+                ]
             else:
                 documents = read_methods[file_type](file_path)
 
@@ -370,22 +384,31 @@ class ChunkProcessor:
             json.dump(self.document_metadata, json_file, indent=4)
 
     def load_document_data(self):
-        with open(
-            f"{self.config['log_chunk_dir']}/docs/doc_content.json", "r"
-        ) as json_file:
-            self.document_data = json.load(json_file)
-        with open(
-            f"{self.config['log_chunk_dir']}/metadata/doc_metadata.json", "r"
-        ) as json_file:
-            self.document_metadata = json.load(json_file)
-        self.logger.info(
-            f"Loaded document content from {self.config['log_chunk_dir']}/docs/doc_content.json. Total documents: {len(self.document_data)}"
-        )
+        try:
+            with open(
+                f"{self.config['log_chunk_dir']}/docs/doc_content.json", "r"
+            ) as json_file:
+                self.document_data = json.load(json_file)
+            with open(
+                f"{self.config['log_chunk_dir']}/metadata/doc_metadata.json", "r"
+            ) as json_file:
+                self.document_metadata = json.load(json_file)
+            self.logger.info(
+                f"Loaded document content from {self.config['log_chunk_dir']}/docs/doc_content.json. Total documents: {len(self.document_data)}"
+            )
+        except FileNotFoundError:
+            self.logger.warning(
+                f"Document content not found in {self.config['log_chunk_dir']}/docs/doc_content.json"
+            )
+            self.document_data = {}
+            self.document_metadata = {}
 
 
 class DataLoader:
     def __init__(self, config, logger=None):
-        self.file_reader = FileReader(logger=logger, kind=config["llm_params"]["pdf_reader"])
+        self.file_reader = FileReader(
+            logger=logger, kind=config["llm_params"]["pdf_reader"]
+        )
         self.chunk_processor = ChunkProcessor(config, logger=logger)
 
     def get_chunks(self, uploaded_files, weblinks):
@@ -403,19 +426,22 @@ if __name__ == "__main__":
     with open("../code/modules/config/config.yml", "r") as f:
         config = yaml.safe_load(f)
 
-    STORAGE_DIR = os.path.join(BASE_DIR, config['vectorstore']["data_path"])
+    STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
     uploaded_files = [
-        os.path.join(STORAGE_DIR, file) for file in os.listdir(STORAGE_DIR) if file != "urls.txt"
+        os.path.join(STORAGE_DIR, file)
+        for file in os.listdir(STORAGE_DIR)
+        if file != "urls.txt"
    ]
 
     data_loader = DataLoader(config, logger=logger)
     document_chunks, document_names, documents, document_metadata = (
         data_loader.get_chunks(
-            ["https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"],
+            [
+                "https://dl4ds.github.io/fa2024/static_files/discussion_slides/00_discussion.pdf"
+            ],
             [],
         )
     )
 
     print(document_names[:5])
     print(len(document_chunks))
-
code/modules/dataloader/helpers.py CHANGED
@@ -2,6 +2,8 @@ import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 import tempfile
+from modules.config.constants import TIMEOUT
+
 
 def get_urls_from_file(file_path: str):
     """
@@ -26,11 +28,11 @@ def get_metadata(lectures_url, schedule_url):
     lecture_metadata = {}
 
     # Get the main lectures page content
-    r_lectures = requests.get(lectures_url)
+    r_lectures = requests.get(lectures_url, timeout=TIMEOUT)
     soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")
 
     # Get the main schedule page content
-    r_schedule = requests.get(schedule_url)
+    r_schedule = requests.get(schedule_url, timeout=TIMEOUT)
     soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")
 
     # Find all lecture blocks
@@ -118,7 +120,7 @@ def download_pdf_from_url(pdf_url):
     Returns:
         str: The local file path of the downloaded PDF file.
     """
-    response = requests.get(pdf_url)
+    response = requests.get(pdf_url, timeout=TIMEOUT)
     if response.status_code == 200:
         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
             temp_file.write(response.content)
code/modules/dataloader/pdf_readers/gpt.py CHANGED
@@ -6,6 +6,7 @@ from io import BytesIO
 from openai import OpenAI
 from pdf2image import convert_from_path
 from langchain.schema import Document
+from modules.config.constants import TIMEOUT
 
 
 class GPTParser:
@@ -19,9 +20,9 @@ class GPTParser:
         self.api_key = os.getenv("OPENAI_API_KEY")
         self.prompt = """
         The provided documents are images of PDFs of lecture slides of deep learning material.
-        They contain LaTeX equations, images, and text.
+        They contain LaTeX equations, images, and text.
         The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
-        The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
+        The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
         For images, give a description and if you can, a source. Separate each page with '---'.
         Just respond with the markdown. Do not include page numbers or any other metadata. Do not try to provide titles. Strictly the content.
         """
@@ -31,36 +32,45 @@ class GPTParser:
 
         encoded_images = [self.encode_image(image) for image in images]
 
-        chunks = [encoded_images[i:i + 5] for i in range(0, len(encoded_images), 5)]
+        chunks = [encoded_images[i : i + 5] for i in range(0, len(encoded_images), 5)]
 
         headers = {
             "Content-Type": "application/json",
-            "Authorization": f"Bearer {self.api_key}"
+            "Authorization": f"Bearer {self.api_key}",
         }
 
         output = ""
         for chunk_num, chunk in enumerate(chunks):
-            content = [{"type": "image_url", "image_url": {
-                "url": f"data:image/jpeg;base64,{image}"}} for image in chunk]
+            content = [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{image}"},
+                }
+                for image in chunk
+            ]
 
             content.insert(0, {"type": "text", "text": self.prompt})
 
             payload = {
                 "model": "gpt-4o-mini",
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": content
-                    }
-                ],
+                "messages": [{"role": "user", "content": content}],
             }
 
             response = requests.post(
-                "https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+                "https://api.openai.com/v1/chat/completions",
+                headers=headers,
+                json=payload,
+                timeout=TIMEOUT,
+            )
 
             resp = response.json()
 
-            chunk_output = resp['choices'][0]['message']['content'].replace("```", "").replace("markdown", "").replace("````", "")
+            chunk_output = (
+                resp["choices"][0]["message"]["content"]
+                .replace("```", "")
+                .replace("markdown", "")
+                .replace("````", "")
+            )
 
             output += chunk_output + "\n---\n"
 
@@ -68,14 +78,12 @@ class GPTParser:
         output = [doc for doc in output if doc.strip() != ""]
 
         documents = [
-            Document(
-                page_content=page,
-                metadata={"source": pdf_path, "page": i}
-            ) for i, page in enumerate(output)
+            Document(page_content=page, metadata={"source": pdf_path, "page": i})
+            for i, page in enumerate(output)
         ]
         return documents
 
     def encode_image(self, image):
         buffered = BytesIO()
         image.save(buffered, format="JPEG")
-        return base64.b64encode(buffered.getvalue()).decode('utf-8')
+        return base64.b64encode(buffered.getvalue()).decode("utf-8")
code/modules/dataloader/pdf_readers/llama.py CHANGED
@@ -2,19 +2,18 @@ import os
 import requests
 from llama_parse import LlamaParse
 from langchain.schema import Document
-from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
+from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY, TIMEOUT
 from modules.dataloader.helpers import download_pdf_from_url
 
 
-
 class LlamaParser:
     def __init__(self):
         self.GPT_API_KEY = OPENAI_API_KEY
         self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
         self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
         self.headers = {
-            'Accept': 'application/json',
-            'Authorization': f'Bearer {LLAMA_CLOUD_API_KEY}'
+            "Accept": "application/json",
+            "Authorization": f"Bearer {LLAMA_CLOUD_API_KEY}",
         }
         self.parser = LlamaParse(
             api_key=LLAMA_CLOUD_API_KEY,
@@ -23,7 +22,7 @@ class LlamaParser:
             language="en",
             gpt4o_mode=False,
             # gpt4o_api_key=OPENAI_API_KEY,
-            parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source."
+            parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source.",
         )
 
     def parse(self, pdf_path):
@@ -38,10 +37,8 @@ class LlamaParser:
         pages = [page.strip() for page in pages]
 
         documents = [
-            Document(
-                page_content=page,
-                metadata={"source": pdf_path, "page": i}
-            ) for i, page in enumerate(pages)
+            Document(page_content=page, metadata={"source": pdf_path, "page": i})
+            for i, page in enumerate(pages)
         ]
 
         return documents
@@ -53,20 +50,30 @@ class LlamaParser:
         }
 
         files = [
-            ('file', ('file', requests.get(pdf_url).content, 'application/octet-stream'))
+            (
+                "file",
+                (
+                    "file",
+                    requests.get(pdf_url, timeout=TIMEOUT).content,
+                    "application/octet-stream",
+                ),
+            )
         ]
 
         response = requests.request(
-            "POST", self.parse_url, headers=self.headers, data=payload, files=files)
+            "POST", self.parse_url, headers=self.headers, data=payload, files=files
+        )
 
-        return response.json()['id'], response.json()['status']
+        return response.json()["id"], response.json()["status"]
 
     async def get_result(self, job_id):
-        url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
+        url = (
+            f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
+        )
 
         response = requests.request("GET", url, headers=self.headers, data={})
 
-        return response.json()['markdown']
+        return response.json()["markdown"]
 
     async def _parse(self, pdf_path):
         job_id, status = self.make_request(pdf_path)
@@ -78,15 +85,9 @@ class LlamaParser:
 
         result = await self.get_result(job_id)
 
-        documents = [
-            Document(
-                page_content=result,
-                metadata={"source": pdf_path}
-            )
-        ]
+        documents = [Document(page_content=result, metadata={"source": pdf_path})]
 
         return documents
 
-    async def _parse(self, pdf_path):
-        return await self._parse(pdf_path)
-
+    # async def _parse(self, pdf_path):
+    #     return await self._parse(pdf_path)
code/modules/dataloader/webpage_crawler.py CHANGED
@@ -3,7 +3,9 @@ from aiohttp import ClientSession
 import asyncio
 import requests
 from bs4 import BeautifulSoup
-from urllib.parse import urlparse, urljoin, urldefrag
+from urllib.parse import urljoin, urldefrag
+from modules.config.constants import TIMEOUT
+
 
 class WebpageCrawler:
     def __init__(self):
@@ -18,7 +20,7 @@ class WebpageCrawler:
 
     def url_exists(self, url: str) -> bool:
         try:
-            response = requests.head(url)
+            response = requests.head(url, timeout=TIMEOUT)
             return response.status_code == 200
         except requests.ConnectionError:
             return False
@@ -88,7 +90,7 @@ class WebpageCrawler:
 
     def is_webpage(self, url: str) -> bool:
         try:
-            response = requests.head(url, allow_redirects=True)
+            response = requests.head(url, allow_redirects=True, timeout=TIMEOUT)
             content_type = response.headers.get("Content-Type", "").lower()
             return "text/html" in content_type
         except requests.RequestException:
code/modules/vectorstore/colbert.py CHANGED
@@ -1,9 +1,9 @@
 from ragatouille import RAGPretrainedModel
 from modules.vectorstore.base import VectorStoreBase
 from langchain_core.retrievers import BaseRetriever
-from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun, Callbacks
+from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
 from langchain_core.documents import Document
-from typing import Any, List, Optional, Sequence
+from typing import Any, List
 import os
 import json
 
@@ -85,6 +85,7 @@ class ColbertVectorStore(VectorStoreBase):
             document_ids=document_names,
             document_metadatas=document_metadata,
         )
+        print(f"Index created at {index_path}")
         self.colbert.set_document_count(len(document_names))
 
     def load_database(self):
code/modules/vectorstore/embedding_model_loader.py CHANGED
@@ -1,9 +1,6 @@
 from langchain_community.embeddings import OpenAIEmbeddings
 from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.embeddings import LlamaCppEmbeddings
-
-from modules.config.constants import *
-import os
+from modules.config.constants import OPENAI_API_KEY, HUGGINGFACE_TOKEN
 
 
 class EmbeddingModelLoader:
@@ -28,8 +25,5 @@ class EmbeddingModelLoader:
                 "trust_remote_code": True,
             },
         )
-        # embedding_model = LlamaCppEmbeddings(
-        #     model_path=os.path.abspath("storage/llama-7b.ggmlv3.q4_0.bin")
-        # )
 
         return embedding_model
code/modules/vectorstore/faiss.py CHANGED
@@ -14,10 +14,15 @@ class FaissVectorStore(VectorStoreBase):
     def __init__(self, config):
         self.config = config
         self._init_vector_db()
-        self.local_path = os.path.join(self.config["vectorstore"]["db_path"],
-                                       "db_" + self.config["vectorstore"]["db_option"]
-                                       + "_" + self.config["vectorstore"]["model"]
-                                       + "_" + config["splitter_options"]["chunking_mode"])
+        self.local_path = os.path.join(
+            self.config["vectorstore"]["db_path"],
+            "db_"
+            + self.config["vectorstore"]["db_option"]
+            + "_"
+            + self.config["vectorstore"]["model"]
+            + "_"
+            + config["splitter_options"]["chunking_mode"],
+        )
 
     def _init_vector_db(self):
         self.faiss = FAISS(
@@ -28,9 +33,7 @@ class FaissVectorStore(VectorStoreBase):
         self.vectorstore = self.faiss.from_documents(
             documents=document_chunks, embedding=embedding_model
         )
-        self.vectorstore.save_local(
-            self.local_path
-        )
+        self.vectorstore.save_local(self.local_path)
 
     def load_database(self, embedding_model):
         self.vectorstore = self.faiss.load_local(
code/modules/vectorstore/raptor.py CHANGED
@@ -317,13 +317,10 @@ class RAPTORVectoreStore(VectorStoreBase):
         print(f"--Generated {len(all_clusters)} clusters--")
 
         # Summarization
-        template = """Here is content from the course DS598: Deep Learning for Data Science.
-
+        template = """Here is content from the course DS598: Deep Learning for Data Science.
         The content may be form webapge about the course, or lecture content, or any other relevant information.
         If the content is in bullet points (from pdf lectre slides), you can summarize the bullet points.
-
         Give a detailed summary of the content below.
-
         Documentation:
         {context}
         """
code/modules/vectorstore/store_manager.py CHANGED
@@ -1,9 +1,7 @@
  from modules.vectorstore.vectorstore import VectorStore
- from modules.vectorstore.helpers import *
+ from modules.dataloader.helpers import get_urls_from_file
  from modules.dataloader.webpage_crawler import WebpageCrawler
  from modules.dataloader.data_loader import DataLoader
- from modules.dataloader.helpers import *
- from modules.config.constants import RETRIEVER_HF_PATHS
  from modules.vectorstore.embedding_model_loader import EmbeddingModelLoader
  import logging
  import os
@@ -117,7 +115,7 @@ class VectorStoreManager:
          )
          num_documents = len(document_chunks)
          self.logger.info(f"Number of documents in the DB: {num_documents}")
-         metadata_keys = list(document_metadata[0].keys())
+         metadata_keys = list(document_metadata[0].keys()) if document_metadata else []
          self.logger.info(f"Metadata keys: {metadata_keys}")
          self.logger.info("Completed loading data")
          self.initialize_database(
@@ -170,13 +168,21 @@ if __name__ == "__main__":
  
      with open("modules/config/config.yml", "r") as f:
          config = yaml.safe_load(f)
+     with open("modules/config/user_config.yml", "r") as f:
+         user_config = yaml.safe_load(f)
      print(config)
+     print(user_config)
      print(f"Trying to create database with config: {config}")
      vector_db = VectorStoreManager(config)
      if config["vectorstore"]["load_from_HF"]:
-         if config["vectorstore"]["db_option"] in RETRIEVER_HF_PATHS:
+         if (
+             config["vectorstore"]["db_option"]
+             in user_config["retriever"]["retriever_hf_paths"]
+         ):
              vector_db.load_from_HF(
-                 HF_PATH=RETRIEVER_HF_PATHS[config["vectorstore"]["db_option"]]
+                 HF_PATH=user_config["retriever"]["retriever_hf_paths"][
+                     config["vectorstore"]["db_option"]
+                 ]
              )
          else:
              # print(f"HF_PATH not available for {config['vectorstore']['db_option']}")
@@ -189,7 +195,7 @@ if __name__ == "__main__":
      vector_db.create_database()
      print("Created database")
  
-     print(f"Trying to load the database")
+     print("Trying to load the database")
      vector_db = VectorStoreManager(config)
      vector_db.load_database()
      print("Loaded database")
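This change replaces the `RETRIEVER_HF_PATHS` constant with a `retriever_hf_paths` mapping read from `modules/config/user_config.yml`. As a rough orientation, a minimal sketch of that file's relevant section might look like the following; the `db_option` keys and Hugging Face paths are illustrative, not the project's actual values:

```yaml
# Hypothetical excerpt from modules/config/user_config.yml.
# Maps each supported db_option to the Hugging Face path that
# VectorStoreManager.load_from_HF() downloads the prebuilt store from.
retriever:
  retriever_hf_paths:
    FAISS: "your-org/tutor-vectorstore-faiss"          # illustrative path
    RAGatouille: "your-org/tutor-vectorstore-colbert"  # illustrative path
```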
docs/README.md DELETED
@@ -1,51 +0,0 @@
- # Documentation
- 
- ## File Structure:
- - `docs/` - Documentation files
- - `code/` - Code files
- - `storage/` - Storage files
- - `vectorstores/` - Vector Databases
- - `.env` - Environment Variables
- - `Dockerfile` - Dockerfile for Hugging Face
- - `.chainlit` - Chainlit Configuration
- - `chainlit.md` - Chainlit README
- - `README.md` - Repository README
- - `.gitignore` - Gitignore file
- - `requirements.txt` - Python Requirements
- - `.gitattributes` - Gitattributes file
- 
- ## Code Structure
- 
- `code/main.py` - Main Chainlit App
- `code/config.yaml` - Configuration File to set Embedding related, Vector Database related, and Chat Model related parameters.
- `code/modules/vector_db.py` - Vector Database Creation
- `code/modules/chat_model_loader.py` - Chat Model Loader (Creates the Chat Model)
- `code/modules/constants.py` - Constants (Loads the Environment Variables, Prompts, Model Paths, etc.)
- `code/modules/data_loader.py` - Loads and Chunks the Data
- `code/modules/embedding_model.py` - Creates the Embedding Model to Embed the Data
- `code/modules/llm_tutor.py` - Creates the RAG LLM Tutor
- The Function `qa_bot()` loads the vector database and the chat model, and sets the prompt to pass to the chat model.
- `code/modules/helpers.py` - Helper Functions
- 
- ## Storage and Vectorstores
- 
- - `storage/data/` - Data Storage (Put your pdf files under this directory, and urls in the urls.txt file)
- - `storage/models/` - Model Storage (Put your local LLMs under this directory)
- 
- - `vectorstores/` - Vector Databases (Stores the Vector Databases generated from `code/modules/vector_db.py`)
- 
- 
- ## Useful Configurations
- set these in `code/config.yaml`:
- * ``["embedding_options"]["embedd_files"]`` - If set to True, embeds the files from the storage directory everytime you run the chainlit command. If set to False, uses the stored vector database.
- * ``["embedding_options"]["expand_urls"]`` - If set to True, gets and reads the data from all the links under the url provided. If set to False, only reads the data in the url provided.
- * ``["embedding_options"]["search_top_k"]`` - Number of sources that the retriever returns
- * ``["llm_params]["use_history"]`` - Whether to use history in the prompt or not
- * ``["llm_params]["memory_window"]`` - Number of interactions to keep a track of in the history
- 
- 
- ## LlamaCpp
- * https://python.langchain.com/docs/integrations/llms/llamacpp
- 
- ## Hugging Face Models
- * Download the ``.gguf`` files for your Local LLM from Hugging Face (Example: https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF)
docs/contribute.md ADDED
@@ -0,0 +1,33 @@
+ 💡 **Please ensure formatting, linting, and security checks pass before submitting a pull request.**
+ 
+ ## Code Formatting
+ 
+ The codebase is formatted using [black](https://github.com/psf/black).
+ 
+ To format the codebase, run the following command:
+ 
+ ```bash
+ black .
+ ```
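If you only want to verify formatting without modifying any files, black also supports a check mode, and `--diff` prints the edits it would apply:

```bash
# report files that would be reformatted, without changing them
black --check .
# also show the exact changes black would make
black --diff .
```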
+ 
+ Please ensure that the code is formatted before submitting a pull request.
+ 
+ ## Linting
+ 
+ The codebase is linted using [flake8](https://flake8.pycqa.org/en/latest/).
+ 
+ To view the linting errors, run the following command:
+ 
+ ```bash
+ flake8 .
+ ```
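While iterating on a single module, it can be faster to lint just that path; for example, using a file that appears in this repository:

```bash
# lint one package
flake8 code/modules/vectorstore/
# lint a single file
flake8 code/modules/vectorstore/store_manager.py
```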
+ 
+ ## Security and Vulnerabilities
+ 
+ The codebase is scanned for security vulnerabilities using [bandit](https://github.com/PyCQA/bandit).
+ 
+ To scan the codebase for security vulnerabilities, run the following command:
+ 
+ ```bash
+ bandit -r .
+ ```
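On a large tree, bandit output can be noisy. Its standard flags let you filter by severity or exclude paths; the excluded directories below are illustrative, not mandated by the project:

```bash
# report only high-severity findings
bandit -r . -lll
# skip directories that hold generated artifacts (paths illustrative)
bandit -r . -x ./vectorstores,./storage
```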
docs/setup.md ADDED
@@ -0,0 +1,127 @@
+ # Initial Setup
+ 
+ ⚠️ **Create the .env file inside the `code/` directory.**
+ 
+ ## Python Environment
+ 
+ Python Version: 3.11
+ 
+ Create a virtual environment and install the required packages:
+ 
+ ```bash
+ conda create -n ai_tutor python=3.11
+ conda activate ai_tutor
+ pip install -r requirements.txt
+ ```
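Conda is not strictly required; a plain virtual environment should work as well, as long as it uses Python 3.11. A sketch, assuming `python3.11` is on your PATH:

```bash
python3.11 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
```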
+ 
+ ## Code Formatting
+ 
+ The codebase is formatted using [black](https://github.com/psf/black). If you make changes to the codebase, ensure that the code is formatted before submitting a pull request. More instructions can be found in `docs/contribute.md`.
+ 
+ ## Google OAuth 2.0 Client ID and Secret
+ 
+ To set up the Google OAuth 2.0 Client ID and Secret, follow these steps:
+ 
+ 1. Go to the [Google Cloud Console](https://console.cloud.google.com/apis/credentials).
+ 2. Create a new project or select an existing one.
+ 3. Navigate to the "Credentials" page.
+ 4. Click on "Create Credentials" and select "OAuth 2.0 Client ID".
+ 5. Configure the OAuth consent screen if you haven't already.
+ 6. Choose "Web application" as the application type.
+ 7. Configure the redirect URIs as needed.
+ 8. Copy the generated `Client ID` and `Client Secret`.
+ 
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
+ 
+ ```bash
+ OAUTH_GOOGLE_CLIENT_ID=<your_client_id>
+ OAUTH_GOOGLE_CLIENT_SECRET=<your_client_secret>
+ ```
+ 
+ ## Literal AI API Key
+ 
+ To obtain the Literal AI API key:
+ 
+ 1. Sign up or log in to [Literal AI](https://cloud.getliteral.ai/).
+ 2. Navigate to the API Keys section under your account settings.
+ 3. Create a new API key if necessary and copy it.
+ 
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
+ 
+ ```bash
+ LITERAL_API_KEY_LOGGING=<your_api_key>
+ LITERAL_API_URL=https://cloud.getliteral.ai
+ ```
+ 
+ ## LlamaCloud API Key
+ 
+ To obtain the LlamaCloud API Key:
+ 
+ 1. Go to [LlamaCloud](https://cloud.llamaindex.ai/).
+ 2. Sign up or log in to your account.
+ 3. Navigate to the API section and generate a new API key if necessary.
+ 
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
+ 
+ ```bash
+ LLAMA_CLOUD_API_KEY=<your_api_key>
+ ```
+ 
+ ## Hugging Face Access Token
+ 
+ To obtain your Hugging Face access token:
+ 
+ 1. Go to [Hugging Face settings](https://huggingface.co/settings/tokens).
+ 2. Log in or create an account.
+ 3. Generate a new token or use an existing one.
+ 
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
+ 
+ ```bash
+ HUGGINGFACE_TOKEN=<your-huggingface-token>
+ ```
+ 
+ ## Chainlit Authentication Secret
+ 
+ You must provide a JWT secret in the environment to use authentication. Run `chainlit create-secret` to generate one.
+ 
+ ```bash
+ chainlit create-secret
+ ```
+ 
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
+ 
+ ```bash
+ CHAINLIT_AUTH_SECRET=<your_jwt_secret>
+ CHAINLIT_URL=<your_chainlit_url> # Example: CHAINLIT_URL=http://localhost:8000
+ ```
+ 
+ ## OpenAI API Key
+ 
+ Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
+ 
+ ```bash
+ OPENAI_API_KEY=<your_openai_api_key>
+ ```
+ 
+ ## In a Nutshell
+ 
+ Your .env file (secrets on Hugging Face Spaces) should look like this:
+ 
+ ```bash
+ CHAINLIT_AUTH_SECRET=<your_jwt_secret>
+ OPENAI_API_KEY=<your_openai_api_key>
+ HUGGINGFACE_TOKEN=<your-huggingface-token>
+ LITERAL_API_KEY_LOGGING=<your_api_key>
+ LITERAL_API_URL=https://cloud.getliteral.ai
+ OAUTH_GOOGLE_CLIENT_ID=<your_client_id>
+ OAUTH_GOOGLE_CLIENT_SECRET=<your_client_secret>
+ LLAMA_CLOUD_API_KEY=<your_api_key>
+ CHAINLIT_URL=<your_chainlit_url>
+ ```
+ 
+ # Configuration
+ 
+ The configuration file `code/modules/config/config.yml` contains the parameters that control the behaviour of your app.
+ The configuration file `code/modules/config/user_config.yml` contains user-defined parameters.
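For orientation, the `vectorstore` section of `config.yml` is what drives the `store_manager` flow shown in the diff above. A minimal sketch with illustrative values only:

```yaml
# Hypothetical excerpt from code/modules/config/config.yml.
vectorstore:
  load_from_HF: True   # download a prebuilt vectorstore from Hugging Face
  db_option: "FAISS"   # illustrative; must match a key in user_config's retriever_hf_paths
```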
pyproject.toml ADDED
@@ -0,0 +1,2 @@
+ [tool.black]
+ line-length = 88
requirements.txt CHANGED
@@ -27,3 +27,6 @@ langchain_experimental
  html2text
  PyPDF2
  pdf2image
+ black
+ flake8
+ bandit
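Since black, flake8, and bandit are now listed in `requirements.txt`, a regular `pip install -r requirements.txt` pulls in the dev tools too; to add just them to an existing environment:

```bash
pip install black flake8 bandit
```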