Commit 6d056d5 by XThomasBU
Parent: 4dc8546
Commit message: updates, added metadata to prompt
Files changed:
- Dockerfile (+3 -5)
- Dockerfile.dev (+27 -0)
- code/config.yml (+4 -4)
- code/main.py (+9 -5)
- code/modules/constants.py (+2 -0)
- code/modules/data_loader.py (+88 -91)
- code/modules/embedding_model_loader.py (+6 -2)
- code/modules/helpers.py (+119 -87)
- code/modules/llm_tutor.py (+93 -11)
- code/modules/vector_db.py (+10 -15)
- public/test.css (+13 -0)
- requirements.txt (+19 -19)
Dockerfile
CHANGED
@@ -1,14 +1,12 @@
-FROM python:3.…
+FROM python:3.11
 
 WORKDIR /code
 
 COPY ./requirements.txt /code/requirements.txt
 
-RUN pip install --…
+RUN pip install --upgrade pip
 
-RUN pip install --no-cache-dir …
-
-RUN pip install --upgrade --force-reinstall --no-cache-dir llama-cpp-python==0.2.32
+RUN pip install --no-cache-dir -r /code/requirements.txt
 
 COPY . /code
 
Dockerfile.dev
ADDED
@@ -0,0 +1,27 @@
+FROM python:3.11
+
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+RUN pip install --upgrade pip
+
+RUN pip install --no-cache-dir -r /code/requirements.txt
+
+COPY . /code
+
+RUN ls -R
+
+# Change permissions to allow writing to the directory
+RUN chmod -R 777 /code
+
+# Create a logs directory and set permissions
+RUN mkdir /code/logs && chmod 777 /code/logs
+
+# Create a cache directory within the application's working directory
+RUN mkdir /.cache && chmod -R 777 /.cache
+
+# Expose the port the app runs on
+EXPOSE 8051
+
+CMD python code/modules/vector_db.py && chainlit run code/main.py --port 8051
code/config.yml
CHANGED
@@ -2,18 +2,18 @@ embedding_options:
   embedd_files: False # bool
   data_path: 'storage/data' # str
   url_file_path: 'storage/data/urls.txt' # str
-  expand_urls: …
+  expand_urls: True # bool
   db_option : 'RAGatouille' # str [FAISS, Chroma, RAGatouille]
   db_path : 'vectorstores' # str
   model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
   search_top_k : 3 # int
   score_threshold : 0.2 # float
 llm_params:
-  use_history: …
+  use_history: True # bool
   memory_window: 3 # int
-  llm_loader: '…
+  llm_loader: 'openai' # str [local_llm, openai]
   openai_params:
-    model: 'gpt-…
+    model: 'gpt-3.5-turbo-1106' # str [gpt-3.5-turbo-1106, gpt-4]
   local_llm_params:
     model: "storage/models/llama-2-7b-chat.Q4_0.gguf"
     model_type: "llama"
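For context on how these flags are consumed: the file is plain YAML, so a loader along these lines is assumed elsewhere in the app (a minimal sketch, not part of this commit; the path is illustrative):

    import yaml

    # Minimal sketch: load config.yml and read the options toggled above
    with open("code/config.yml") as f:
        config = yaml.safe_load(f)

    print(config["embedding_options"]["expand_urls"])      # True (parsed as a bool by YAML)
    print(config["llm_params"]["llm_loader"])              # 'openai'
    print(config["llm_params"]["openai_params"]["model"])  # 'gpt-3.5-turbo-1106'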
code/main.py
CHANGED
@@ -38,10 +38,6 @@ logger.addHandler(file_handler)
 @cl.set_chat_profiles
 async def chat_profile():
     return [
-        cl.ChatProfile(
-            name="Llama",
-            markdown_description="Use the local LLM: **Tiny Llama**.",
-        ),
         # cl.ChatProfile(
         #     name="Mistral",
         #     markdown_description="Use the local LLM: **Mistral**.",
@@ -54,6 +50,10 @@ async def chat_profile():
             name="gpt-4",
             markdown_description="Use OpenAI API for **gpt-4**.",
         ),
+        cl.ChatProfile(
+            name="Llama",
+            markdown_description="Use the local LLM: **Tiny Llama**.",
+        ),
     ]
 
 
@@ -96,7 +96,7 @@ async def start():
     model = config["llm_params"]["local_llm_params"]["model"]
     msg = cl.Message(content=f"Starting the bot {model}...")
     await msg.send()
-    msg.content = …
+    msg.content = opening_message
     await msg.update()
 
     cl.user_session.set("chain", chain)
@@ -119,6 +119,10 @@ async def main(message):
     answer = res["result"]
     print(f"answer: {answer}")
 
+    logger.info(f"Question: {res['question']}")
+    logger.info(f"History: {res['chat_history']}")
+    logger.info(f"Answer: {answer}\n")
+
     answer_with_sources, source_elements = get_sources(res, answer)
 
     await cl.Message(content=answer_with_sources, elements=source_elements).send()
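The reordering matters because Chainlit displays profiles in the order the function returns them, so the OpenAI profiles now appear before Llama. The chosen profile is read back at session start roughly like this (a sketch; the actual branching in start() is not shown in this hunk):

    import chainlit as cl

    @cl.on_chat_start
    async def start():
        # Chainlit stores the selected profile name in the user session
        chat_profile = cl.user_session.get("chat_profile")
        if chat_profile == "Llama":
            print("Loading the local model")
        else:
            print(f"Using OpenAI model: {chat_profile}")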
code/modules/constants.py
CHANGED
@@ -6,7 +6,9 @@ load_dotenv()
 # API Keys - Loaded from the .env file
 
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 
+opening_message = f"Hey, What Can I Help You With?\n\nYou can ask me questions about the course logistics, course content, about the final project, or anything else!"
 
 # Prompt Templates
 
code/modules/data_loader.py
CHANGED
@@ -14,17 +14,15 @@ from llama_parse import LlamaParse
 from langchain.schema import Document
 import logging
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_experimental.text_splitter import SemanticChunker
-from langchain_openai.embeddings import OpenAIEmbeddings
 from ragatouille import RAGPretrainedModel
 from langchain.chains import LLMChain
 from langchain.llms import OpenAI
 from langchain import PromptTemplate
 
 try:
-    from modules.helpers import …
+    from modules.helpers import get_metadata
 except:
-    from helpers import …
+    from helpers import get_metadata
 
 logger = logging.getLogger(__name__)
 
@@ -96,6 +94,14 @@ class FileReader:
         loader = WebBaseLoader(url)
         return loader.load()
 
+    def read_tex_from_url(self, tex_url):
+        response = requests.get(tex_url)
+        if response.status_code == 200:
+            return [Document(page_content=response.text)]
+        else:
+            print("Failed to fetch .tex file from URL:", tex_url)
+            return None
+
 
 class ChunkProcessor:
     def __init__(self, config):
@@ -120,17 +126,6 @@ class ChunkProcessor:
         self.splitter = None
         logger.info("ChunkProcessor instance created")
 
-    # def extract_metadata(self, document_content):
-
-    #     llm = OpenAI()
-    #     prompt_template = PromptTemplate(
-    #         input_variables=["document_content"],
-    #         template="Extract metadata for this document:\n\n{document_content}\n\nMetadata:",
-    #     )
-    #     chain = LLMChain(llm=llm, prompt=prompt_template)
-    #     metadata = chain.run(document_content=document_content)
-    #     return metadata
-
     def remove_delimiters(self, document_chunks: list):
         for chunk in document_chunks:
             for delimiter in self.config["splitter_options"]["delimiters_to_remove"]:
@@ -151,7 +146,12 @@ class ChunkProcessor:
         self, documents, file_type="txt", source="", page=0, metadata={}
     ):
         documents = [Document(page_content=documents, source=source, page=page)]
-        if …
+        if (
+            file_type == "txt"
+            or file_type == "docx"
+            or file_type == "srt"
+            or file_type == "tex"
+        ):
             document_chunks = self.splitter.split_documents(documents)
         elif file_type == "pdf":
             document_chunks = documents  # Full page for now
@@ -179,58 +179,54 @@ class ChunkProcessor:
         self.documents = []
         self.document_metadata = []
 
-        …
+        addl_metadata = get_metadata(
             "https://dl4ds.github.io/sp2024/lectures/",
             "https://dl4ds.github.io/sp2024/schedule/",
-        )  # …
+        )  # For any additional metadata
 
         for file_index, file_path in enumerate(uploaded_files):
             file_name = os.path.basename(file_path)
-            …
-            # full_text = ""
-            # for doc in documents:
-            #     full_text += doc.page_content
-            #     break  # getting only first page for now
-
-            # extracted_metadata = self.extract_metadata(full_text)
-
-            for doc in documents:
-                page_num = doc.metadata.get("page", 0)
-                self.documents.append(doc.page_content)
-                self.document_metadata.append({"source": file_path, "page": page_num})
-                if "lecture" in file_path.lower():
-                    metadata = lecture_metadata.get(file_path, {})
-                    metadata["source_type"] = "lecture"
-                    self.document_metadata[-1].update(metadata)
-                else:
-                    …
-                        file_type,
-                        source=file_path,
-                        page=page_num,
-                        metadata=metadata,
-                    )
+            if file_name not in self.parent_document_names:
+                file_type = file_name.split(".")[-1].lower()
+
+                # try:
+                if file_type == "pdf":
+                    documents = file_reader.read_pdf(file_path)
+                elif file_type == "txt":
+                    documents = file_reader.read_txt(file_path)
+                elif file_type == "docx":
+                    documents = file_reader.read_docx(file_path)
+                elif file_type == "srt":
+                    documents = file_reader.read_srt(file_path)
+                elif file_type == "tex":
+                    documents = file_reader.read_tex_from_url(file_path)
+                else:
+                    logger.warning(f"Unsupported file type: {file_type}")
+                    continue
+
+                for doc in documents:
+                    page_num = doc.metadata.get("page", 0)
+                    self.documents.append(doc.page_content)
+                    self.document_metadata.append(
+                        {"source": file_path, "page": page_num}
+                    )
+                    metadata = addl_metadata.get(file_path, {})
+                    self.document_metadata[-1].update(metadata)
+
+                    self.child_document_names.append(f"{file_name}_{page_num}")
+
+                    self.parent_document_names.append(file_name)
+                    if self.config["embedding_options"]["db_option"] not in [
+                        "RAGatouille"
+                    ]:
+                        document_chunks = self.process_chunks(
+                            self.documents[-1],
+                            file_type,
+                            source=file_path,
+                            page=page_num,
+                            metadata=metadata,
+                        )
+                        self.document_chunks_full.extend(document_chunks)
 
         # except Exception as e:
         #     logger.error(f"Error processing file {file_name}: {str(e)}")
@@ -252,37 +248,38 @@ class ChunkProcessor:
         logger.info(f"Splitting weblinks: total of {len(weblinks)}")
 
         for link_index, link in enumerate(weblinks):
-            …
-            )
-            self.child_document_names.append(f"{link}")
-
-            self.parent_document_names.append(link)
-            if self.config["embedding_options"]["db_option"] not in [
-                "RAGatouille"
-            ]:
-                document_chunks = self.process_chunks(
-                    self.documents[-1],
-                    "txt",
-                    source=link,
-                    page=0,
-                    metadata={"source_type": "webpage"},
-                )
-                self.document_chunks_full.extend(document_chunks)
-            except Exception as e:
-                logger.error(
-                    f"Error splitting link {link_index+1} : {link}: {str(e)}"
-                )
+            if link not in self.parent_document_names:
+                try:
+                    logger.info(f"\tSplitting link {link_index+1} : {link}")
+                    if "youtube" in link:
+                        documents = file_reader.read_youtube_transcript(link)
+                    else:
+                        documents = file_reader.read_html(link)
+
+                    for doc in documents:
+                        page_num = doc.metadata.get("page", 0)
+                        self.documents.append(doc.page_content)
+                        self.document_metadata.append(
+                            {"source": link, "page": page_num}
+                        )
+                        self.child_document_names.append(f"{link}")
+
+                        self.parent_document_names.append(link)
+                        if self.config["embedding_options"]["db_option"] not in [
+                            "RAGatouille"
+                        ]:
+                            document_chunks = self.process_chunks(
+                                self.documents[-1],
+                                "txt",
+                                source=link,
+                                page=0,
+                                metadata={"source_type": "webpage"},
+                            )
+                            self.document_chunks_full.extend(document_chunks)
+                except Exception as e:
+                    logger.error(
+                        f"Error splitting link {link_index+1} : {link}: {str(e)}"
+                    )
 
 
 class DataLoader:
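The rewritten get_chunks loop now skips anything already recorded in parent_document_names and dispatches on the file extension with an if/elif chain. The same dispatch can be expressed as a lookup table; a hypothetical, behavior-equivalent sketch (names taken from the diff above, not a change this commit makes):

    import logging

    logger = logging.getLogger(__name__)

    def read_with_dispatch(file_reader, file_path):
        """Hypothetical table-driven version of the if/elif chain in get_chunks."""
        file_name = file_path.rsplit("/", 1)[-1]
        file_type = file_name.split(".")[-1].lower()
        readers = {
            "pdf": file_reader.read_pdf,
            "txt": file_reader.read_txt,
            "docx": file_reader.read_docx,
            "srt": file_reader.read_srt,
            "tex": file_reader.read_tex_from_url,
        }
        reader = readers.get(file_type)
        if reader is None:
            logger.warning(f"Unsupported file type: {file_type}")
            return None
        return reader(file_path)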
code/modules/embedding_model_loader.py
CHANGED
@@ -24,8 +24,12 @@ class EmbeddingModelLoader:
             )
         else:
             embedding_model = HuggingFaceEmbeddings(
-                model_name="…
-                model_kwargs={…
+                model_name=self.config["embedding_options"]["model"],
+                model_kwargs={
+                    "device": "cpu",
+                    "token": f"{HUGGINGFACE_TOKEN}",
+                    "trust_remote_code": True,
+                },
             )
             # embedding_model = LlamaCppEmbeddings(
             #     model_path=os.path.abspath("storage/llama-7b.ggmlv3.q4_0.bin")
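The else branch now reads the model name from config instead of a hardcoded string and forwards the HuggingFace token. In isolation, the call it builds looks roughly like this with the config defaults inlined (a sketch; which model_kwargs are accepted ultimately depends on the installed sentence-transformers version):

    from langchain_community.embeddings import HuggingFaceEmbeddings

    # Sketch: same call the loader makes, with config.yml defaults inlined
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cpu"},  # token / trust_remote_code matter for gated models
    )
    vector = embedding_model.embed_query("When is the final project due?")
    print(len(vector))  # 384 dimensions for all-MiniLM-L6-v2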
code/modules/helpers.py
CHANGED
@@ -1,11 +1,15 @@
 import requests
 from bs4 import BeautifulSoup
 from tqdm import tqdm
-from urllib.parse import urlparse
 import chainlit as cl
 from langchain import PromptTemplate
 import requests
 from bs4 import BeautifulSoup
+from urllib.parse import urlparse, urljoin, urldefrag
+import asyncio
+import aiohttp
+from aiohttp import ClientSession
+from typing import Dict, Any, List
 
 try:
     from modules.constants import *
@@ -19,82 +23,112 @@ Ref: https://python.plainenglish.io/scraping-the-subpages-on-a-website-ea2d4e3db
 
 class WebpageCrawler:
     def __init__(self):
-        …
+        self.dict_href_links = {}
 
-    def …
-        …
+    async def fetch(self, session: ClientSession, url: str) -> str:
+        async with session.get(url) as response:
+            try:
+                return await response.text()
+            except UnicodeDecodeError:
+                return await response.text(encoding="latin1")
 
-    def url_exists(self, url):
+    def url_exists(self, url: str) -> bool:
         try:
             response = requests.head(url)
             return response.status_code == 200
         except requests.ConnectionError:
             return False
 
-    def get_links(self, website_link, base_url…
-        base_url = website_link
-        html_data = self.getdata(website_link)
+    async def get_links(self, session: ClientSession, website_link: str, base_url: str):
+        html_data = await self.fetch(session, website_link)
         soup = BeautifulSoup(html_data, "html.parser")
         list_links = []
         for link in soup.find_all("a", href=True):
-
-            # Append to list if new link contains original link
-            if str(link["href"]).startswith((str(website_link))):
-                list_links.append(link["href"])
-
-            # Include all href that do not start with website link but with "/"
-            if str(link["href"]).startswith("/"):
-                if link["href"] not in self.dict_href_links:
-                    print(link["href"])
-                    self.dict_href_links[link["href"]] = None
-                    link_with_www = base_url + link["href"][1:]
-                    if self.url_exists(link_with_www):
-                        print("adjusted link =", link_with_www)
-                        list_links.append(link_with_www)
-
-        # Convert list of links to dictionary and define keys as the links and the values as "Not-checked"
-        dict_links = dict.fromkeys(list_links, "Not-checked")
-        return dict_links
-
-    def get_subpage_links(self, l, base_url):
-        for link in tqdm(l):
-            print("checking link:", link)
-            if not link.endswith("/"):
-                l[link] = "Checked"
-                dict_links_subpages = {}
-            else:
-                …
-        return …
-
-    def get_all_pages(self, url, base_url):
-        dict_links = {url: "Not-checked"}
-        self.dict_href_links = {}
-        counter, counter2 = None, 0
-        while counter != 0:
-            counter2 += 1
-            dict_links2 = self.get_subpage_links(dict_links, base_url)
-            # Count number of non-values and set counter to 0 if there are no values within the dictionary equal to the string "Not-checked"
-            # https://stackoverflow.com/questions/48371856/count-the-number-of-occurrences-of-a-certain-value-in-a-dictionary-in-python
-            counter = sum(value == "Not-checked" for value in dict_links2.values())
-            dict_links = dict_links2
-        checked_urls = [
-            url for url, status in dict_links.items() if status == "Checked"
-        ]
-        return checked_urls
+            href = link["href"].strip()
+            full_url = urljoin(base_url, href)
+            normalized_url = self.normalize_url(full_url)  # sections removed
+            if (
+                normalized_url not in self.dict_href_links
+                and self.is_child_url(normalized_url, base_url)
+                and self.url_exists(normalized_url)
+            ):
+                self.dict_href_links[normalized_url] = None
+                list_links.append(normalized_url)
+
+        return list_links
+
+    async def get_subpage_links(
+        self, session: ClientSession, urls: list, base_url: str
+    ):
+        tasks = [self.get_links(session, url, base_url) for url in urls]
+        results = await asyncio.gather(*tasks)
+        all_links = [link for sublist in results for link in sublist]
+        return all_links
+
+    async def get_all_pages(self, url: str, base_url: str):
+        async with aiohttp.ClientSession() as session:
+            dict_links = {url: "Not-checked"}
+            counter = None
+            while counter != 0:
+                unchecked_links = [
+                    link
+                    for link, status in dict_links.items()
+                    if status == "Not-checked"
+                ]
+                if not unchecked_links:
+                    break
+                new_links = await self.get_subpage_links(
+                    session, unchecked_links, base_url
+                )
+                for link in unchecked_links:
+                    dict_links[link] = "Checked"
+                    print(f"Checked: {link}")
+                dict_links.update(
+                    {
+                        link: "Not-checked"
+                        for link in new_links
+                        if link not in dict_links
+                    }
+                )
+                counter = len(
+                    [
+                        status
+                        for status in dict_links.values()
+                        if status == "Not-checked"
+                    ]
+                )
+
+            checked_urls = [
+                url for url, status in dict_links.items() if status == "Checked"
+            ]
+            return checked_urls
+
+    def is_webpage(self, url: str) -> bool:
+        try:
+            response = requests.head(url, allow_redirects=True)
+            content_type = response.headers.get("Content-Type", "").lower()
+            return "text/html" in content_type
+        except requests.RequestException:
+            return False
+
+    def clean_url_list(self, urls):
+        files, webpages = [], []
+
+        for url in urls:
+            if self.is_webpage(url):
+                webpages.append(url)
+            else:
+                files.append(url)
+
+        return files, webpages
+
+    def is_child_url(self, url, base_url):
+        return url.startswith(base_url)
+
+    def normalize_url(self, url: str):
+        # Strip the fragment identifier
+        defragged_url, _ = urldefrag(url)
+        return defragged_url
 
 
 def get_urls_from_file(file_path: str):
@@ -183,40 +217,38 @@ def get_sources(res, answer):
 
         name = f"Source {idx + 1} Text\n"
         full_answer += name
-        source_elements.append(…
+        source_elements.append(
+            cl.Text(name=name, content=source_data["text"], display="side")
+        )
 
         # Add a PDF element if the source is a PDF file
         if source_data["url"].lower().endswith(".pdf"):
             name = f"Source {idx + 1} PDF\n"
             full_answer += name
             pdf_url = f"{source_data['url']}#page={source_data['page']+1}"
-            source_elements.append(cl.Pdf(name=name, url=pdf_url))
-
-    # Finally, include lecture metadata for each unique source
-    # displayed_urls = set()
-    # full_answer += "\n**Metadata:**\n"
-    # for url_name, source_data in source_dict.items():
-    #     if source_data["url"] not in displayed_urls:
-    #         full_answer += f"\nSource: {source_data['url']}\n"
-    #         full_answer += f"Type: {source_data['source_type']}\n"
-    #         full_answer += f"TL;DR: {source_data['lecture_tldr']}\n"
-    #         full_answer += f"Lecture Recording: {source_data['lecture_recording']}\n"
-    #         full_answer += f"Suggested Readings: {source_data['suggested_readings']}\n"
-    #         displayed_urls.add(source_data["url"])
+            source_elements.append(cl.Pdf(name=name, url=pdf_url, display="side"))
+
     full_answer += "\n**Metadata:**\n"
-    for url_name, source_data in source_dict.items():
-        full_answer += f"\nSource …
-        …
+    for idx, (url_name, source_data) in enumerate(source_dict.items()):
+        full_answer += f"\nSource {idx + 1} Metadata:\n"
+        source_elements.append(
+            cl.Text(
+                name=f"Source {idx + 1} Metadata",
+                content=f"Source: {source_data['url']}\n"
+                f"Page: {source_data['page']}\n"
+                f"Type: {source_data['source_type']}\n"
+                f"Date: {source_data['date']}\n"
+                f"TL;DR: {source_data['lecture_tldr']}\n"
+                f"Lecture Recording: {source_data['lecture_recording']}\n"
+                f"Suggested Readings: {source_data['suggested_readings']}\n",
+                display="side",
+            )
+        )
 
     return full_answer, source_elements
 
 
-def …
+def get_metadata(lectures_url, schedule_url):
     """
     Function to get the lecture metadata from the lectures and schedule URLs.
     """
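Taken together, the crawler is now fully asynchronous: fetch retrieves one page, get_links extracts and filters child URLs, get_subpage_links fans a batch out with asyncio.gather, and get_all_pages loops until no URL is left "Not-checked". Driving it from synchronous code would look roughly like this (a usage sketch; the seed URL is the course site referenced elsewhere in this commit):

    import asyncio

    crawler = WebpageCrawler()
    seed = "https://dl4ds.github.io/sp2024/"
    # Passing the seed as base_url restricts the crawl to child pages of the seed
    pages = asyncio.run(crawler.get_all_pages(seed, seed))
    files, webpages = crawler.clean_url_list(pages)
    print(f"{len(webpages)} HTML pages, {len(files)} non-HTML links")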
code/modules/llm_tutor.py
CHANGED
@@ -5,18 +5,99 @@ from langchain_community.embeddings import OpenAIEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.chains import RetrievalQA, ConversationalRetrievalChain
 from langchain.llms import CTransformers
-from langchain.memory import ConversationBufferWindowMemory
+from langchain.memory import ConversationBufferWindowMemory, ConversationSummaryBufferMemory
 from langchain.chains.conversational_retrieval.prompts import QA_PROMPT
 import os
 from modules.constants import *
 from modules.helpers import get_prompt
 from modules.chat_model_loader import ChatModelLoader
 from modules.vector_db import VectorDB, VectorDBScore
+from typing import Dict, Any, Optional
+from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
+import inspect
+from langchain.chains.conversational_retrieval.base import _get_chat_history
+
+
+class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
+    async def _acall(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
+        question = inputs["question"]
+        get_chat_history = self.get_chat_history or _get_chat_history
+        chat_history_str = get_chat_history(inputs["chat_history"])
+        print(f"chat_history_str: {chat_history_str}")
+        if chat_history_str:
+            callbacks = _run_manager.get_child()
+            new_question = await self.question_generator.arun(
+                question=question, chat_history=chat_history_str, callbacks=callbacks
+            )
+        else:
+            new_question = question
+        accepts_run_manager = (
+            "run_manager" in inspect.signature(self._aget_docs).parameters
+        )
+        if accepts_run_manager:
+            docs = await self._aget_docs(new_question, inputs, run_manager=_run_manager)
+        else:
+            docs = await self._aget_docs(new_question, inputs)  # type: ignore[call-arg]
+
+        output: Dict[str, Any] = {}
+        if self.response_if_no_docs_found is not None and len(docs) == 0:
+            output[self.output_key] = self.response_if_no_docs_found
+        else:
+            new_inputs = inputs.copy()
+            if self.rephrase_question:
+                new_inputs["question"] = new_question
+            new_inputs["chat_history"] = chat_history_str
+
+            # Prepare the final prompt with metadata
+            context = "\n\n".join(
+                [
+                    f"Document content: {doc.page_content}\nMetadata: {doc.metadata}"
+                    for doc in docs
+                ]
+            )
+            final_prompt = f"""
+            You are an AI Tutor for the course DS598, taught by Prof. Thomas Gardos. Use the following pieces of information to answer the user's question.
+            If you don't know the answer, just say that you don't know—don't try to make up an answer.
+            Use the chat history to answer the question only if it's relevant; otherwise, ignore it. The context for the answer will be under "Document context:".
+            Use the metadata from each document to guide the user to the correct sources.
+            The context is ordered by relevance to the question. Give more weight to the most relevant documents.
+            Talk in a friendly and personalized manner, similar to how you would speak to a friend who needs help. Make the conversation engaging and avoid sounding repetitive or robotic.
+
+            Chat History:
+            {chat_history_str}
+
+            Context:
+            {context}
+
+            Question: {new_question}
+            AI Tutor:
+            """
+
+            new_inputs["input"] = final_prompt
+            new_inputs["question"] = final_prompt
+            output["final_prompt"] = final_prompt
+
+            answer = await self.combine_docs_chain.arun(
+                input_documents=docs, callbacks=_run_manager.get_child(), **new_inputs
+            )
+            output[self.output_key] = answer
+
+        if self.return_source_documents:
+            output["source_documents"] = docs
+        if self.return_generated_question:
+            output["generated_question"] = new_question
+        return output
 
 
 class LLMTutor:
     def __init__(self, config, logger=None):
         self.config = config
+        self.llm = self.load_llm()
         self.vector_db = VectorDB(config, logger=logger)
         if self.config["embedding_options"]["embedd_files"]:
             self.vector_db.create_database()
@@ -36,26 +117,28 @@ class LLMTutor:
         if self.config["embedding_options"]["db_option"] in ["FAISS", "Chroma"]:
             retriever = VectorDBScore(
                 vectorstore=db,
-                search_type="similarity_score_threshold",
-                search_kwargs={
-                    …
-                },
+                # search_type="similarity_score_threshold",
+                # search_kwargs={
+                #     "score_threshold": self.config["embedding_options"][
+                #         "score_threshold"
+                #     ],
+                #     "k": self.config["embedding_options"]["search_top_k"],
+                # },
             )
         elif self.config["embedding_options"]["db_option"] == "RAGatouille":
             retriever = db.as_langchain_retriever(
                 k=self.config["embedding_options"]["search_top_k"]
            )
         if self.config["llm_params"]["use_history"]:
-            memory = …
+            memory = ConversationSummaryBufferMemory(
+                llm=llm,
                 k=self.config["llm_params"]["memory_window"],
                 memory_key="chat_history",
                 return_messages=True,
                 output_key="answer",
+                max_token_limit=128,
             )
-        qa_chain = …
+        qa_chain = CustomConversationalRetrievalChain.from_llm(
             llm=llm,
             chain_type="stuff",
             retriever=retriever,
@@ -82,7 +165,6 @@ class LLMTutor:
     # QA Model Function
     def qa_bot(self):
        db = self.vector_db.load_database()
-        self.llm = self.load_llm()
        qa_prompt = self.set_custom_prompt()
        qa = self.retrieval_qa_chain(self.llm, qa_prompt, db)
 
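Two behavioral notes on this file. First, CustomConversationalRetrievalChain overrides _acall so the combine-docs prompt carries each retrieved document's metadata alongside its content, which is what the commit message refers to. Second, the memory swap: ConversationBufferWindowMemory keeps the last k turns verbatim, whereas ConversationSummaryBufferMemory keeps recent turns verbatim and asks the LLM to fold older turns into a running summary once max_token_limit is exceeded. A standalone sketch of the new memory (hypothetical inputs, any LangChain LLM works):

    from langchain.llms import OpenAI
    from langchain.memory import ConversationSummaryBufferMemory

    memory = ConversationSummaryBufferMemory(
        llm=OpenAI(),          # used to write the rolling summary
        max_token_limit=128,   # turns beyond this token budget get summarized
        memory_key="chat_history",
        return_messages=True,
        output_key="answer",
    )
    memory.save_context({"question": "What is DS598?"}, {"answer": "A deep learning course."})
    print(memory.load_memory_variables({}))  # recent turns verbatim, older ones summarized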
code/modules/vector_db.py
CHANGED
@@ -96,21 +96,17 @@ class VectorDB:
         if self.config["embedding_options"]["expand_urls"]:
             all_urls = []
             for url in urls:
-                …
-                all_urls.extend(…
+                loop = asyncio.get_event_loop()
+                all_urls.extend(
+                    loop.run_until_complete(
+                        self.webpage_crawler.get_all_pages(
+                            url, url
+                        )  # only get child urls, if you want to get all urls, replace the second argument with the base url
+                    )
+                )
             urls = all_urls
         return files, urls
 
-    def clean_url_list(self, urls):
-        # get lecture pdf links
-        lecture_pdfs = [link for link in urls if link.endswith(".pdf")]
-        lecture_pdfs = [link for link in lecture_pdfs if "lecture" in link.lower()]
-        urls = [
-            link for link in urls if link.endswith("/")
-        ]  # only keep links that end with a '/'. Extract Files Seperately
-
-        return urls, lecture_pdfs
-
     def create_embedding_model(self):
         self.logger.info("Creating embedding function")
         self.embedding_model_loader = EmbeddingModelLoader(self.config)
@@ -158,12 +154,11 @@ class VectorDB:
         data_loader = DataLoader(self.config)
         self.logger.info("Loading data")
         files, urls = self.load_files()
-        …
-        files += lecture_pdfs
+        files, webpages = self.webpage_crawler.clean_url_list(urls)
         if "storage/data/urls.txt" in files:
             files.remove("storage/data/urls.txt")
         document_chunks, document_names, documents, document_metadata = (
-            data_loader.get_chunks(files, …
+            data_loader.get_chunks(files, webpages)
         )
         self.logger.info("Completed loading data")
         self.initialize_database(
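A design note on the expansion loop above: asyncio.get_event_loop / run_until_complete is invoked once per seed URL, so seeds are crawled sequentially even though each crawl fans out internally with asyncio.gather. With many seeds, they could share one event-loop run; a hypothetical alternative, not what this commit does:

    import asyncio

    # Hypothetical: crawl all seed URLs concurrently in a single event-loop run
    async def crawl_all(crawler, seed_urls):
        results = await asyncio.gather(
            *(crawler.get_all_pages(url, url) for url in seed_urls)
        )
        return [page for pages in results for page in pages]

    # Usage inside load_files would then be:
    # all_urls = asyncio.run(crawl_all(self.webpage_crawler, urls))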
public/test.css
CHANGED
@@ -1,3 +1,16 @@
 a[href*='https://github.com/Chainlit/chainlit'] {
     visibility: hidden;
 }
+
+.message-avatar .MuiAvatar-root {
+    background-color: transparent; /* Remove the background color */
+    color: #FFFFFF; /* Change this to your desired text color */
+    border: 0.25px solid #FFFFFF; /* Add a white border for the circle */
+    border-radius: 50%; /* Ensure the avatar remains circular */
+    background-image: url('http://localhost:8051/logo?theme=dark'); /* Path to your logo */
+    background-size: cover; /* Ensure the logo covers the entire avatar */
+    background-position: center; /* Center the logo */
+    background-repeat: no-repeat; /* Prevent the logo from repeating */
+    width: 38px; /* Adjust the width as needed */
+    height: 38px; /* Adjust the height as needed */
+}
requirements.txt
CHANGED
@@ -1,20 +1,20 @@
-…
-…
+# Automatically generated by https://github.com/damnever/pigar.
+
+beautifulsoup4==4.12.3
+chainlit==1.1.202
+langchain==0.1.20
+langchain-community==0.0.38
+langchain-core==0.1.52
+llama-parse==0.4.4
 pysrt==1.1.2
-…
-python…
-…
-beautifulsoup4==4.12.2
-fake-useragent==1.4.0
-git+https://github.com/huggingface/accelerate.git
-llama-cpp-python
-PyPDF2==3.0.1
-ragatouille==0.0.8.post2
+python-dotenv==1.0.1
+PyYAML==6.0.1
+RAGatouille==0.0.8.post2
+requests==2.32.3
+torch==2.3.1
+tqdm==4.66.4
+transformers==4.41.2
+llama-cpp-python==0.2.77
+fake_useragent==1.5.1
+chromadb==0.5.0
+pymupdf==1.24.5