Spaces:
Build error
Build error
Merge branch 'dev_branch' into chainlit_enhancements
Browse files- .github/workflows/push_to_hf_space_prototype.yml +14 -13
- code/.chainlit/config.toml +18 -15
- code/modules/chat/chat_model_loader.py +13 -0
- code/modules/config/config.yml +4 -1
- code/modules/config/constants.py +2 -1
- code/modules/dataloader/data_loader.py +82 -30
- code/modules/dataloader/helpers.py +22 -2
- code/modules/dataloader/pdf_readers/base.py +14 -0
- code/modules/dataloader/pdf_readers/llama.py +92 -0
- code/modules/dataloader/webpage_crawler.py +0 -1
.github/workflows/push_to_hf_space_prototype.yml
CHANGED
@@ -1,20 +1,21 @@
|
|
1 |
name: Push Prototype to HuggingFace
|
2 |
|
3 |
on:
|
4 |
-
|
5 |
-
branches:
|
6 |
-
|
7 |
-
|
|
|
8 |
|
9 |
jobs:
|
10 |
-
|
11 |
runs-on: ubuntu-latest
|
12 |
steps:
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
1 |
name: Push Prototype to HuggingFace
|
2 |
|
3 |
on:
|
4 |
+
push:
|
5 |
+
branches: [dev_branch]
|
6 |
+
|
7 |
+
# run this workflow manually from the Actions tab
|
8 |
+
workflow_dispatch:
|
9 |
|
10 |
jobs:
|
11 |
+
sync-to-hub:
|
12 |
runs-on: ubuntu-latest
|
13 |
steps:
|
14 |
+
- uses: actions/checkout@v4
|
15 |
+
with:
|
16 |
+
fetch-depth: 0
|
17 |
+
lfs: true
|
18 |
+
- name: Deploy Prototype to HuggingFace
|
19 |
+
env:
|
20 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
21 |
+
run: git push https://trgardos:$HF_TOKEN@huggingface.co/spaces/dl4ds/tutor_dev dev_branch:main
|
code/.chainlit/config.toml
CHANGED
@@ -85,31 +85,34 @@ custom_meta_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/f/
|
|
85 |
# custom_build = "./public/build"
|
86 |
|
87 |
[UI.theme]
|
88 |
-
default = "
|
89 |
#layout = "wide"
|
90 |
#font_family = "Inter, sans-serif"
|
91 |
# Override default MUI light theme. (Check theme.ts)
|
92 |
[UI.theme.light]
|
93 |
-
background = "#FAFAFA"
|
94 |
-
paper = "#FFFFFF"
|
95 |
|
96 |
[UI.theme.light.primary]
|
97 |
-
main = "#
|
98 |
-
dark = "#
|
99 |
-
light = "#
|
100 |
[UI.theme.light.text]
|
101 |
-
primary = "#212121"
|
102 |
-
secondary = "#616161"
|
|
|
103 |
# Override default MUI dark theme. (Check theme.ts)
|
104 |
[UI.theme.dark]
|
105 |
-
background = "#
|
106 |
-
paper = "#
|
107 |
|
108 |
[UI.theme.dark.primary]
|
109 |
-
main = "#
|
110 |
-
dark = "#
|
111 |
-
light = "#
|
112 |
-
|
|
|
|
|
113 |
|
114 |
[meta]
|
115 |
-
generated_by = "1.1.
|
|
|
85 |
# custom_build = "./public/build"
|
86 |
|
87 |
[UI.theme]
|
88 |
+
default = "dark"
|
89 |
#layout = "wide"
|
90 |
#font_family = "Inter, sans-serif"
|
91 |
# Override default MUI light theme. (Check theme.ts)
|
92 |
[UI.theme.light]
|
93 |
+
#background = "#FAFAFA"
|
94 |
+
#paper = "#FFFFFF"
|
95 |
|
96 |
[UI.theme.light.primary]
|
97 |
+
#main = "#F80061"
|
98 |
+
#dark = "#980039"
|
99 |
+
#light = "#FFE7EB"
|
100 |
[UI.theme.light.text]
|
101 |
+
#primary = "#212121"
|
102 |
+
#secondary = "#616161"
|
103 |
+
|
104 |
# Override default MUI dark theme. (Check theme.ts)
|
105 |
[UI.theme.dark]
|
106 |
+
#background = "#FAFAFA"
|
107 |
+
#paper = "#FFFFFF"
|
108 |
|
109 |
[UI.theme.dark.primary]
|
110 |
+
#main = "#F80061"
|
111 |
+
#dark = "#980039"
|
112 |
+
#light = "#FFE7EB"
|
113 |
+
[UI.theme.dark.text]
|
114 |
+
#primary = "#EEEEEE"
|
115 |
+
#secondary = "#BDBDBD"
|
116 |
|
117 |
[meta]
|
118 |
+
generated_by = "1.1.304"
|
code/modules/chat/chat_model_loader.py
CHANGED
@@ -5,6 +5,8 @@ from langchain_community.llms import LlamaCpp
|
|
5 |
import torch
|
6 |
import transformers
|
7 |
import os
|
|
|
|
|
8 |
from langchain.callbacks.manager import CallbackManager
|
9 |
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
10 |
from modules.config.constants import LLAMA_PATH
|
@@ -15,6 +17,14 @@ class ChatModelLoader:
|
|
15 |
self.config = config
|
16 |
self.huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
def load_chat_model(self):
|
19 |
if self.config["llm_params"]["llm_loader"] in [
|
20 |
"gpt-3.5-turbo-1106",
|
@@ -24,6 +34,9 @@ class ChatModelLoader:
|
|
24 |
llm = ChatOpenAI(model_name=self.config["llm_params"]["llm_loader"])
|
25 |
elif self.config["llm_params"]["llm_loader"] == "local_llm":
|
26 |
n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
|
|
|
|
|
|
|
27 |
llm = LlamaCpp(
|
28 |
model_path=LLAMA_PATH,
|
29 |
n_batch=n_batch,
|
|
|
5 |
import torch
|
6 |
import transformers
|
7 |
import os
|
8 |
+
from pathlib import Path
|
9 |
+
from huggingface_hub import hf_hub_download
|
10 |
from langchain.callbacks.manager import CallbackManager
|
11 |
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
12 |
from modules.config.constants import LLAMA_PATH
|
|
|
17 |
self.config = config
|
18 |
self.huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
19 |
|
20 |
+
def _verify_model_cache(self, model_cache_path):
    """Make sure the configured GGUF model file is available locally.

    Reads ``repo_id`` and ``filename`` from
    ``self.config["llm_params"]["local_llm_params"]`` and asks
    ``hf_hub_download`` to place that file under ``model_cache_path``.

    Args:
        model_cache_path: Directory used as the HuggingFace cache dir.

    Returns:
        str: Local path of the requested model file.
    """
    local_llm_params = self.config["llm_params"]["local_llm_params"]
    # hf_hub_download returns the local path of exactly the file that was
    # requested. Using it directly avoids globbing the cache, which raised
    # IndexError when nothing matched and could return a different GGUF file
    # when several models share the same cache directory.
    model_file = hf_hub_download(
        repo_id=local_llm_params["repo_id"],
        filename=local_llm_params["filename"],
        cache_dir=model_cache_path,
    )
    return str(model_file)
|
27 |
+
|
28 |
def load_chat_model(self):
|
29 |
if self.config["llm_params"]["llm_loader"] in [
|
30 |
"gpt-3.5-turbo-1106",
|
|
|
34 |
llm = ChatOpenAI(model_name=self.config["llm_params"]["llm_loader"])
|
35 |
elif self.config["llm_params"]["llm_loader"] == "local_llm":
|
36 |
n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
|
37 |
+
model_path = self._verify_model_cache(
|
38 |
+
self.config["llm_params"]["local_llm_params"]["model"]
|
39 |
+
)
|
40 |
llm = LlamaCpp(
|
41 |
model_path=LLAMA_PATH,
|
42 |
n_batch=n_batch,
|
code/modules/config/config.yml
CHANGED
@@ -35,6 +35,9 @@ llm_params:
|
|
35 |
temperature: 0.7 # float
|
36 |
local_llm_params:
|
37 |
temperature: 0.7 # float
|
|
|
|
|
|
|
38 |
stream: False # bool
|
39 |
|
40 |
chat_logging:
|
@@ -52,4 +55,4 @@ splitter_options:
|
|
52 |
chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
|
53 |
front_chunks_to_remove : null # int or None
|
54 |
last_chunks_to_remove : null # int or None
|
55 |
-
delimiters_to_remove : ['\t', '\n', ' ', ' '] # list of strings
|
|
|
35 |
temperature: 0.7 # float
|
36 |
local_llm_params:
|
37 |
temperature: 0.7 # float
|
38 |
+
repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
|
39 |
+
filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
|
40 |
+
pdf_reader: 'llama' # str [llama, pymupdf, gpt]
|
41 |
stream: False # bool
|
42 |
|
43 |
chat_logging:
|
|
|
55 |
chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
|
56 |
front_chunks_to_remove : null # int or None
|
57 |
last_chunks_to_remove : null # int or None
|
58 |
+
delimiters_to_remove : ['\t', '\n', ' ', ' '] # list of strings
|
code/modules/config/constants.py
CHANGED
@@ -6,6 +6,7 @@ load_dotenv()
|
|
6 |
# API Keys - Loaded from the .env file
|
7 |
|
8 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
|
9 |
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
|
10 |
LITERAL_API_KEY_LOGGING = os.getenv("LITERAL_API_KEY_LOGGING")
|
11 |
LITERAL_API_URL = os.getenv("LITERAL_API_URL")
|
@@ -17,6 +18,6 @@ opening_message = f"Hey, What Can I Help You With?\n\nYou can me ask me question
|
|
17 |
|
18 |
# Model Paths
|
19 |
|
20 |
-
LLAMA_PATH = "../storage/models/tinyllama
|
21 |
|
22 |
RETRIEVER_HF_PATHS = {"RAGatouille": "XThomasBU/Colbert_Index"}
|
|
|
6 |
# API Keys - Loaded from the .env file
|
7 |
|
8 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
9 |
+
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
|
10 |
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
|
11 |
LITERAL_API_KEY_LOGGING = os.getenv("LITERAL_API_KEY_LOGGING")
|
12 |
LITERAL_API_URL = os.getenv("LITERAL_API_URL")
|
|
|
18 |
|
19 |
# Model Paths
|
20 |
|
21 |
+
LLAMA_PATH = "../storage/models/tinyllama"
|
22 |
|
23 |
RETRIEVER_HF_PATHS = {"RAGatouille": "XThomasBU/Colbert_Index"}
|
code/modules/dataloader/data_loader.py
CHANGED
@@ -20,26 +20,79 @@ from langchain_community.llms import OpenAI
|
|
20 |
from langchain import PromptTemplate
|
21 |
import json
|
22 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
|
|
|
|
|
|
|
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
return loader
|
34 |
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
class FileReader:
|
40 |
-
def __init__(self, logger):
|
41 |
-
self.pdf_reader = PDFReader()
|
42 |
self.logger = logger
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
def extract_text_from_pdf(self, pdf_path):
|
45 |
text = ""
|
@@ -51,20 +104,12 @@ class FileReader:
|
|
51 |
text += page.extract_text()
|
52 |
return text
|
53 |
|
54 |
-
def download_pdf_from_url(self, pdf_url):
|
55 |
-
response = requests.get(pdf_url)
|
56 |
-
if response.status_code == 200:
|
57 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
|
58 |
-
temp_file.write(response.content)
|
59 |
-
temp_file_path = temp_file.name
|
60 |
-
return temp_file_path
|
61 |
-
else:
|
62 |
-
self.logger.error(f"Failed to download PDF from URL: {pdf_url}")
|
63 |
-
return None
|
64 |
-
|
65 |
def read_pdf(self, temp_file_path: str):
|
66 |
-
|
67 |
-
|
|
|
|
|
|
|
68 |
return documents
|
69 |
|
70 |
def read_txt(self, temp_file_path: str):
|
@@ -179,7 +224,6 @@ class ChunkProcessor:
|
|
179 |
"https://dl4ds.github.io/sp2024/lectures/",
|
180 |
"https://dl4ds.github.io/sp2024/schedule/",
|
181 |
) # For any additional metadata
|
182 |
-
|
183 |
with ThreadPoolExecutor() as executor:
|
184 |
executor.map(
|
185 |
self.process_file,
|
@@ -245,16 +289,17 @@ class ChunkProcessor:
|
|
245 |
)
|
246 |
self.document_chunks_full.extend(document_chunks)
|
247 |
|
|
|
248 |
self.document_data[file_path] = file_data
|
249 |
self.document_metadata[file_path] = file_metadata
|
250 |
|
251 |
def process_file(self, file_path, file_index, file_reader, addl_metadata):
|
252 |
file_name = os.path.basename(file_path)
|
|
|
253 |
if file_name in self.document_data:
|
254 |
return
|
255 |
|
256 |
-
file_type = file_name.split(".")[-1]
|
257 |
-
self.logger.info(f"Reading file {file_index + 1}: {file_path}")
|
258 |
|
259 |
read_methods = {
|
260 |
"pdf": file_reader.read_pdf,
|
@@ -269,6 +314,7 @@ class ChunkProcessor:
|
|
269 |
|
270 |
try:
|
271 |
documents = read_methods[file_type](file_path)
|
|
|
272 |
self.process_documents(
|
273 |
documents, file_path, file_type, "file", addl_metadata
|
274 |
)
|
@@ -330,7 +376,7 @@ class ChunkProcessor:
|
|
330 |
|
331 |
class DataLoader:
|
332 |
def __init__(self, config, logger=None):
|
333 |
-
self.file_reader = FileReader(logger=logger)
|
334 |
self.chunk_processor = ChunkProcessor(config, logger=logger)
|
335 |
|
336 |
def get_chunks(self, uploaded_files, weblinks):
|
@@ -348,13 +394,19 @@ if __name__ == "__main__":
|
|
348 |
with open("../code/modules/config/config.yml", "r") as f:
|
349 |
config = yaml.safe_load(f)
|
350 |
|
|
|
|
|
|
|
|
|
|
|
351 |
data_loader = DataLoader(config, logger=logger)
|
352 |
document_chunks, document_names, documents, document_metadata = (
|
353 |
data_loader.get_chunks(
|
|
|
354 |
[],
|
355 |
-
["https://dl4ds.github.io/sp2024/"],
|
356 |
)
|
357 |
)
|
358 |
|
359 |
-
print(document_names)
|
360 |
print(len(document_chunks))
|
|
|
|
20 |
from langchain import PromptTemplate
|
21 |
import json
|
22 |
from concurrent.futures import ThreadPoolExecutor
|
23 |
+
from urllib.parse import urljoin
|
24 |
+
import html2text
|
25 |
+
import bs4
|
26 |
+
import tempfile
|
27 |
+
import PyPDF2
|
28 |
+
from modules.dataloader.pdf_readers.base import PDFReader
|
29 |
+
from modules.dataloader.pdf_readers.llama import LlamaParser
|
30 |
+
|
31 |
+
try:
    from modules.dataloader.helpers import get_metadata, download_pdf_from_url
    from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
except ImportError:
    # Fall back to the package-relative layout only when the ``modules.``
    # prefix cannot be imported. A bare ``except`` would also hide unrelated
    # errors (e.g. KeyboardInterrupt or a bug raised while importing).
    from dataloader.helpers import get_metadata, download_pdf_from_url
    from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
|
37 |
+
|
38 |
+
logger = logging.getLogger(__name__)
|
39 |
+
BASE_DIR = os.getcwd()
|
40 |
+
|
41 |
+
|
42 |
+
class HTMLReader:
    """Download a web page and convert it to Markdown with absolute links."""

    # Seconds to wait for any single HTTP request before giving up, so one
    # unresponsive host cannot stall the whole conversion.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        pass

    def read_url(self, url):
        """Return the HTML text of ``url``, or ``None`` if it cannot be fetched.

        A non-200 response or a network-level error is logged as a warning.
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            logger.warning(f"Failed to download HTML from URL: {url}")
            return None

        if response.status_code == 200:
            return response.text

        logger.warning(f"Failed to download HTML from URL: {url}")
        return None

    def check_links(self, base_url, html_content):
        """Rewrite every ``<a href>`` to an absolute URL and probe it.

        Empty hrefs and in-page anchors (``#...``) are left untouched. An
        ``http://`` scheme is upgraded to ``https://``. Each resulting
        http(s) link is probed with a HEAD request; anything other than a
        200 response is logged as a warning, and the link is kept.

        Args:
            base_url: URL of the page, used to resolve relative hrefs.
            html_content: Raw HTML of the page.

        Returns:
            str: The HTML with rewritten hrefs.
        """
        soup = bs4.BeautifulSoup(html_content, "html.parser")
        for link in soup.find_all("a"):
            href = link.get("href")

            if not href or href.startswith("#"):
                continue

            # Only upgrade the scheme prefix. A plain str.replace would also
            # rewrite "http" inside relative paths or query strings
            # (e.g. "docs/http_intro.html" -> "docs/https_intro.html").
            if href.startswith("http://"):
                href = "https://" + href[len("http://"):]

            absolute_url = urljoin(base_url, href)
            link['href'] = absolute_url

            # mailto:, javascript:, tel: ... cannot be probed over HTTP.
            if not absolute_url.startswith(("http://", "https://")):
                continue

            try:
                resp = requests.head(absolute_url, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                # The link check is best-effort: report and keep going.
                logger.warning(f"Link {absolute_url} is broken")
                logger.warning(f"Request failed: {exc}")
                continue

            if resp.status_code != 200:
                logger.warning(f"Link {absolute_url} is broken")
                logger.warning(f"Status code: {resp.status_code}")

        return str(soup)

    def html_to_md(self, url, html_content):
        """Convert ``html_content`` to Markdown after making its links absolute."""
        html_processed = self.check_links(url, html_content)
        markdown_content = html2text.html2text(html_processed)
        return markdown_content

    def read_html(self, url):
        """Fetch ``url`` and return it as Markdown, or ``None`` if the fetch failed."""
        html_content = self.read_url(url)
        if html_content:
            return self.html_to_md(url, html_content)
        return None
|
85 |
|
86 |
class FileReader:
|
87 |
+
def __init__(self, logger, kind):
|
|
|
88 |
self.logger = logger
|
89 |
+
self.kind = kind
|
90 |
+
if kind == "llama":
|
91 |
+
self.pdf_reader = LlamaParser()
|
92 |
+
else:
|
93 |
+
self.pdf_reader = PDFReader()
|
94 |
+
self.web_reader = HTMLReader()
|
95 |
+
|
96 |
|
97 |
def extract_text_from_pdf(self, pdf_path):
|
98 |
text = ""
|
|
|
104 |
text += page.extract_text()
|
105 |
return text
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
def read_pdf(self, temp_file_path: str):
    """Parse a PDF with the configured reader and return its documents.

    Both reader classes this module imports (``LlamaParser`` and
    ``PDFReader``) expose ``parse(path)``, so the call does not depend on
    ``self.kind``. The previous non-llama branch called
    ``self.pdf_reader.get_documents(loader)``, a method the imported
    ``PDFReader`` does not define.

    Args:
        temp_file_path: Path of the PDF to read.

    Returns:
        Whatever ``self.pdf_reader.parse`` returns for that path.
    """
    # If the reader's async API is used instead, wrap the call as
    # asyncio.run(self.pdf_reader.parse(temp_file_path)).
    documents = self.pdf_reader.parse(temp_file_path)
    return documents
|
114 |
|
115 |
def read_txt(self, temp_file_path: str):
|
|
|
224 |
"https://dl4ds.github.io/sp2024/lectures/",
|
225 |
"https://dl4ds.github.io/sp2024/schedule/",
|
226 |
) # For any additional metadata
|
|
|
227 |
with ThreadPoolExecutor() as executor:
|
228 |
executor.map(
|
229 |
self.process_file,
|
|
|
289 |
)
|
290 |
self.document_chunks_full.extend(document_chunks)
|
291 |
|
292 |
+
print(f"Processed {file_path}. File_data: {file_data}")
|
293 |
self.document_data[file_path] = file_data
|
294 |
self.document_metadata[file_path] = file_metadata
|
295 |
|
296 |
def process_file(self, file_path, file_index, file_reader, addl_metadata):
|
297 |
file_name = os.path.basename(file_path)
|
298 |
+
|
299 |
if file_name in self.document_data:
|
300 |
return
|
301 |
|
302 |
+
file_type = file_name.split(".")[-1]
|
|
|
303 |
|
304 |
read_methods = {
|
305 |
"pdf": file_reader.read_pdf,
|
|
|
314 |
|
315 |
try:
|
316 |
documents = read_methods[file_type](file_path)
|
317 |
+
|
318 |
self.process_documents(
|
319 |
documents, file_path, file_type, "file", addl_metadata
|
320 |
)
|
|
|
376 |
|
377 |
class DataLoader:
|
378 |
def __init__(self, config, logger=None):
|
379 |
+
self.file_reader = FileReader(logger=logger, kind=config["llm_params"]["pdf_reader"])
|
380 |
self.chunk_processor = ChunkProcessor(config, logger=logger)
|
381 |
|
382 |
def get_chunks(self, uploaded_files, weblinks):
|
|
|
394 |
with open("../code/modules/config/config.yml", "r") as f:
|
395 |
config = yaml.safe_load(f)
|
396 |
|
397 |
+
STORAGE_DIR = os.path.join(BASE_DIR, config['vectorstore']["data_path"])
|
398 |
+
uploaded_files = [
|
399 |
+
os.path.join(STORAGE_DIR, file) for file in os.listdir(STORAGE_DIR) if file != "urls.txt"
|
400 |
+
]
|
401 |
+
|
402 |
data_loader = DataLoader(config, logger=logger)
|
403 |
document_chunks, document_names, documents, document_metadata = (
|
404 |
data_loader.get_chunks(
|
405 |
+
["https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"],
|
406 |
[],
|
|
|
407 |
)
|
408 |
)
|
409 |
|
410 |
+
print(document_names[:5])
|
411 |
print(len(document_chunks))
|
412 |
+
|
code/modules/dataloader/helpers.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import requests
|
2 |
from bs4 import BeautifulSoup
|
3 |
-
from
|
4 |
-
|
5 |
|
6 |
def get_urls_from_file(file_path: str):
|
7 |
"""
|
@@ -106,3 +106,23 @@ def get_metadata(lectures_url, schedule_url):
|
|
106 |
continue
|
107 |
|
108 |
return lecture_metadata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import requests
|
2 |
from bs4 import BeautifulSoup
|
3 |
+
from urllib.parse import urlparse
|
4 |
+
import tempfile
|
5 |
|
6 |
def get_urls_from_file(file_path: str):
|
7 |
"""
|
|
|
106 |
continue
|
107 |
|
108 |
return lecture_metadata
|
109 |
+
|
110 |
+
|
111 |
+
def download_pdf_from_url(pdf_url, timeout=30):
    """
    Function to temporarily download a PDF file from a URL and return the local file path.

    The file is written with ``delete=False``, so the caller is responsible
    for removing it when it is no longer needed.

    Args:
        pdf_url (str): The URL of the PDF file to download.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str: The local file path of the downloaded PDF file, or None if the
        request failed or the server did not answer with status 200.
    """
    try:
        response = requests.get(pdf_url, timeout=timeout)
    except requests.RequestException:
        # Treat network-level failures like HTTP failures: report "no file".
        return None

    if response.status_code != 200:
        return None

    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(response.content)
        return temp_file.name
|
code/modules/dataloader/pdf_readers/base.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.document_loaders import PyMuPDFLoader
|
2 |
+
|
3 |
+
|
4 |
+
class PDFReader:
    """PDF reader built on ``PyMuPDFLoader``."""

    def __init__(self):
        pass

    def get_loader(self, pdf_path):
        """Return a ``PyMuPDFLoader`` constructed for ``pdf_path``."""
        return PyMuPDFLoader(pdf_path)

    def parse(self, pdf_path):
        """Build a loader for ``pdf_path`` and return the result of its ``load()``."""
        return self.get_loader(pdf_path).load()
|
code/modules/dataloader/pdf_readers/llama.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import requests
|
3 |
+
from llama_parse import LlamaParse
|
4 |
+
from langchain.schema import Document
|
5 |
+
from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
|
6 |
+
from modules.dataloader.helpers import download_pdf_from_url
|
7 |
+
|
8 |
+
|
9 |
+
|
10 |
+
class LlamaParser:
    """Parse lecture-slide PDFs into Markdown documents through LlamaParse.

    ``parse`` is the synchronous entry point and uses the ``LlamaParse``
    client. ``make_request`` / ``get_result`` / ``_parse`` talk to the
    LlamaCloud REST endpoints directly.
    """

    def __init__(self):
        self.GPT_API_KEY = OPENAI_API_KEY
        self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
        self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
        self.headers = {
            'Accept': 'application/json',
            'Authorization': f'Bearer {LLAMA_CLOUD_API_KEY}'
        }
        self.parser = LlamaParse(
            api_key=LLAMA_CLOUD_API_KEY,
            result_type="markdown",
            verbose=True,
            language="en",
            gpt4o_mode=False,
            # gpt4o_api_key=OPENAI_API_KEY,
            parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source."
        )

    def parse(self, pdf_path):
        """Parse one PDF and return one ``Document`` per page.

        Args:
            pdf_path: Local path of the PDF. If no such file exists, the value
                is treated as a URL and downloaded to a temporary file first.

        Returns:
            list: ``Document`` objects whose ``page_content`` is the Markdown
            of one page and whose metadata holds ``source`` (the local path
            that was parsed) and ``page`` (0-based index). The list is empty
            when the download fails or the parser returns nothing.
        """
        if not os.path.exists(pdf_path):
            pdf_path = download_pdf_from_url(pdf_path)
            # The helper reports a failed download as None; there is nothing
            # to hand to the parser in that case.
            if pdf_path is None:
                return []

        parsed = self.parser.load_data(pdf_path)
        if not parsed:
            # Guard the [0] access below against an empty parser result.
            return []

        document = parsed[0].to_langchain_format()

        # Pages are separated by a "---" line in the returned Markdown.
        pages = [page.strip() for page in document.page_content.split("\n---\n")]

        return [
            Document(
                page_content=page,
                metadata={"source": pdf_path, "page": i}
            )
            for i, page in enumerate(pages)
        ]

    def make_request(self, pdf_url):
        """Upload the PDF found at ``pdf_url`` and return ``(job_id, status)``."""
        payload = {
            "gpt4o_mode": "false",
            "parsing_instruction": "The provided document is a PDF of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source.",
        }

        files = [
            ('file', ('file', requests.get(pdf_url).content, 'application/octet-stream'))
        ]

        response = requests.request(
            "POST", self.parse_url, headers=self.headers, data=payload, files=files)

        # Decode the body once instead of once per field.
        job = response.json()
        return job['id'], job['status']

    async def get_result(self, job_id):
        """Return the Markdown result of the parsing job ``job_id``."""
        url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"

        response = requests.request("GET", url, headers=self.headers, data={})

        return response.json()['markdown']

    async def _parse(self, pdf_path, poll_interval=2.0, timeout=600.0):
        """Parse ``pdf_path`` through the REST API and return a single document.

        The class previously defined ``_parse`` twice; the second definition
        replaced this one and only called itself, recursing without end. That
        duplicate has been removed.

        Args:
            pdf_path: URL of the PDF to upload.
            poll_interval: Seconds to sleep between status polls.
            timeout: Maximum total seconds to wait for the job to succeed.

        Returns:
            list: One ``Document`` holding the whole Markdown result.

        Raises:
            TimeoutError: If the job has not reached ``SUCCESS`` in time.
        """
        import asyncio

        job_id, status = self.make_request(pdf_path)

        waited = 0.0
        while status != "SUCCESS":
            if waited >= timeout:
                # Also ends the wait for a job that will never succeed.
                raise TimeoutError(
                    f"Parsing job {job_id} did not succeed within {timeout} seconds "
                    f"(last status: {status})"
                )
            # Yield to the event loop instead of hammering the API in a
            # tight loop.
            await asyncio.sleep(poll_interval)
            waited += poll_interval

            url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}"
            response = requests.request("GET", url, headers=self.headers, data={})
            status = response.json()["status"]

        result = await self.get_result(job_id)

        return [
            Document(
                page_content=result,
                metadata={"source": pdf_path}
            )
        ]
|
92 |
+
|
code/modules/dataloader/webpage_crawler.py
CHANGED
@@ -66,7 +66,6 @@ class WebpageCrawler:
|
|
66 |
)
|
67 |
for link in unchecked_links:
|
68 |
dict_links[link] = "Checked"
|
69 |
-
print(f"Checked: {link}")
|
70 |
dict_links.update(
|
71 |
{
|
72 |
link: "Not-checked"
|
|
|
66 |
)
|
67 |
for link in unchecked_links:
|
68 |
dict_links[link] = "Checked"
|
|
|
69 |
dict_links.update(
|
70 |
{
|
71 |
link: "Not-checked"
|