XThomasBU committed on
Commit
1afc595
2 Parent(s): ae2ff9e 6f6768d

Merge branch 'dev_branch' into chainlit_enhancements

Browse files
.github/workflows/push_to_hf_space_prototype.yml CHANGED
@@ -1,20 +1,21 @@
1
  name: Push Prototype to HuggingFace
2
 
3
  on:
4
- pull_request:
5
- branches:
6
- - dev_branch
7
-
 
8
 
9
  jobs:
10
- build:
11
  runs-on: ubuntu-latest
12
  steps:
13
- - name: Deploy Prototype to HuggingFace
14
- uses: nateraw/huggingface-sync-action@v0.0.4
15
- with:
16
- github_repo_id: DL4DS/dl4ds_tutor
17
- huggingface_repo_id: dl4ds/tutor_dev
18
- repo_type: space
19
- space_sdk: static
20
- hf_token: ${{ secrets.HF_TOKEN }}
 
1
  name: Push Prototype to HuggingFace
2
 
3
  on:
4
+ push:
5
+ branches: [dev_branch]
6
+
7
+ # run this workflow manually from the Actions tab
8
+ workflow_dispatch:
9
 
10
  jobs:
11
+ sync-to-hub:
12
  runs-on: ubuntu-latest
13
  steps:
14
+ - uses: actions/checkout@v4
15
+ with:
16
+ fetch-depth: 0
17
+ lfs: true
18
+ - name: Deploy Prototype to HuggingFace
19
+ env:
20
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
21
+ run: git push https://trgardos:$HF_TOKEN@huggingface.co/spaces/dl4ds/tutor_dev dev_branch:main
code/.chainlit/config.toml CHANGED
@@ -85,31 +85,34 @@ custom_meta_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/f/
85
  # custom_build = "./public/build"
86
 
87
  [UI.theme]
88
- default = "light"
89
  #layout = "wide"
90
  #font_family = "Inter, sans-serif"
91
  # Override default MUI light theme. (Check theme.ts)
92
  [UI.theme.light]
93
- background = "#FAFAFA"
94
- paper = "#FFFFFF"
95
 
96
  [UI.theme.light.primary]
97
- main = "#b22222" # Brighter shade of red
98
- dark = "#8b0000" # Darker shade of the brighter red
99
- light = "#ff6347" # Lighter shade of the brighter red
100
  [UI.theme.light.text]
101
- primary = "#212121"
102
- secondary = "#616161"
 
103
  # Override default MUI dark theme. (Check theme.ts)
104
  [UI.theme.dark]
105
- background = "#1C1C1C" # Slightly lighter dark background color
106
- paper = "#2A2A2A" # Slightly lighter dark paper color
107
 
108
  [UI.theme.dark.primary]
109
- main = "#89CFF0" # Primary color
110
- dark = "#3700B3" # Dark variant of primary color
111
- light = "#CFBCFF" # Lighter variant of primary color
112
-
 
 
113
 
114
  [meta]
115
- generated_by = "1.1.302"
 
85
  # custom_build = "./public/build"
86
 
87
  [UI.theme]
88
+ default = "dark"
89
  #layout = "wide"
90
  #font_family = "Inter, sans-serif"
91
  # Override default MUI light theme. (Check theme.ts)
92
  [UI.theme.light]
93
+ #background = "#FAFAFA"
94
+ #paper = "#FFFFFF"
95
 
96
  [UI.theme.light.primary]
97
+ #main = "#F80061"
98
+ #dark = "#980039"
99
+ #light = "#FFE7EB"
100
  [UI.theme.light.text]
101
+ #primary = "#212121"
102
+ #secondary = "#616161"
103
+
104
  # Override default MUI dark theme. (Check theme.ts)
105
  [UI.theme.dark]
106
+ #background = "#FAFAFA"
107
+ #paper = "#FFFFFF"
108
 
109
  [UI.theme.dark.primary]
110
+ #main = "#F80061"
111
+ #dark = "#980039"
112
+ #light = "#FFE7EB"
113
+ [UI.theme.dark.text]
114
+ #primary = "#EEEEEE"
115
+ #secondary = "#BDBDBD"
116
 
117
  [meta]
118
+ generated_by = "1.1.304"
code/modules/chat/chat_model_loader.py CHANGED
@@ -5,6 +5,8 @@ from langchain_community.llms import LlamaCpp
5
  import torch
6
  import transformers
7
  import os
 
 
8
  from langchain.callbacks.manager import CallbackManager
9
  from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
10
  from modules.config.constants import LLAMA_PATH
@@ -15,6 +17,14 @@ class ChatModelLoader:
15
  self.config = config
16
  self.huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
17
 
 
 
 
 
 
 
 
 
18
  def load_chat_model(self):
19
  if self.config["llm_params"]["llm_loader"] in [
20
  "gpt-3.5-turbo-1106",
@@ -24,6 +34,9 @@ class ChatModelLoader:
24
  llm = ChatOpenAI(model_name=self.config["llm_params"]["llm_loader"])
25
  elif self.config["llm_params"]["llm_loader"] == "local_llm":
26
  n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
 
 
 
27
  llm = LlamaCpp(
28
  model_path=LLAMA_PATH,
29
  n_batch=n_batch,
 
5
  import torch
6
  import transformers
7
  import os
8
+ from pathlib import Path
9
+ from huggingface_hub import hf_hub_download
10
  from langchain.callbacks.manager import CallbackManager
11
  from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
12
  from modules.config.constants import LLAMA_PATH
 
17
  self.config = config
18
  self.huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
19
 
20
+ def _verify_model_cache(self, model_cache_path):
21
+ hf_hub_download(
22
+ repo_id=self.config["llm_params"]["local_llm_params"]["repo_id"],
23
+ filename=self.config["llm_params"]["local_llm_params"]["filename"],
24
+ cache_dir=model_cache_path,
25
+ )
26
+ return str(list(Path(model_cache_path).glob("*/snapshots/*/*.gguf"))[0])
27
+
28
  def load_chat_model(self):
29
  if self.config["llm_params"]["llm_loader"] in [
30
  "gpt-3.5-turbo-1106",
 
34
  llm = ChatOpenAI(model_name=self.config["llm_params"]["llm_loader"])
35
  elif self.config["llm_params"]["llm_loader"] == "local_llm":
36
  n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
37
+ model_path = self._verify_model_cache(
38
+ self.config["llm_params"]["local_llm_params"]["model"]
39
+ )
40
  llm = LlamaCpp(
41
  model_path=LLAMA_PATH,
42
  n_batch=n_batch,
code/modules/config/config.yml CHANGED
@@ -35,6 +35,9 @@ llm_params:
35
  temperature: 0.7 # float
36
  local_llm_params:
37
  temperature: 0.7 # float
 
 
 
38
  stream: False # bool
39
 
40
  chat_logging:
@@ -52,4 +55,4 @@ splitter_options:
52
  chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
53
  front_chunks_to_remove : null # int or None
54
  last_chunks_to_remove : null # int or None
55
- delimiters_to_remove : ['\t', '\n', ' ', ' '] # list of strings
 
35
  temperature: 0.7 # float
36
  local_llm_params:
37
  temperature: 0.7 # float
38
+ repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
39
+ filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
40
+ pdf_reader: 'llama' # str [llama, pymupdf, gpt]
41
  stream: False # bool
42
 
43
  chat_logging:
 
55
  chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
56
  front_chunks_to_remove : null # int or None
57
  last_chunks_to_remove : null # int or None
58
+ delimiters_to_remove : ['\t', '\n', ' ', ' '] # list of strings
code/modules/config/constants.py CHANGED
@@ -6,6 +6,7 @@ load_dotenv()
6
  # API Keys - Loaded from the .env file
7
 
8
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 
9
  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
10
  LITERAL_API_KEY_LOGGING = os.getenv("LITERAL_API_KEY_LOGGING")
11
  LITERAL_API_URL = os.getenv("LITERAL_API_URL")
@@ -17,6 +18,6 @@ opening_message = f"Hey, What Can I Help You With?\n\nYou can me ask me question
17
 
18
  # Model Paths
19
 
20
- LLAMA_PATH = "../storage/models/tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf"
21
 
22
  RETRIEVER_HF_PATHS = {"RAGatouille": "XThomasBU/Colbert_Index"}
 
6
  # API Keys - Loaded from the .env file
7
 
8
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
9
+ LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
10
  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
11
  LITERAL_API_KEY_LOGGING = os.getenv("LITERAL_API_KEY_LOGGING")
12
  LITERAL_API_URL = os.getenv("LITERAL_API_URL")
 
18
 
19
  # Model Paths
20
 
21
+ LLAMA_PATH = "../storage/models/tinyllama"
22
 
23
  RETRIEVER_HF_PATHS = {"RAGatouille": "XThomasBU/Colbert_Index"}
code/modules/dataloader/data_loader.py CHANGED
@@ -20,26 +20,79 @@ from langchain_community.llms import OpenAI
20
  from langchain import PromptTemplate
21
  import json
22
  from concurrent.futures import ThreadPoolExecutor
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- from modules.dataloader.helpers import get_metadata
 
 
 
 
 
 
25
 
 
 
 
 
26
 
27
- class PDFReader:
28
- def __init__(self):
29
- pass
 
30
 
31
- def get_loader(self, pdf_path):
32
- loader = PyMuPDFLoader(pdf_path)
33
- return loader
34
 
35
- def get_documents(self, loader):
36
- return loader.load()
 
 
 
 
37
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  class FileReader:
40
- def __init__(self, logger):
41
- self.pdf_reader = PDFReader()
42
  self.logger = logger
 
 
 
 
 
 
 
43
 
44
  def extract_text_from_pdf(self, pdf_path):
45
  text = ""
@@ -51,20 +104,12 @@ class FileReader:
51
  text += page.extract_text()
52
  return text
53
 
54
- def download_pdf_from_url(self, pdf_url):
55
- response = requests.get(pdf_url)
56
- if response.status_code == 200:
57
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
58
- temp_file.write(response.content)
59
- temp_file_path = temp_file.name
60
- return temp_file_path
61
- else:
62
- self.logger.error(f"Failed to download PDF from URL: {pdf_url}")
63
- return None
64
-
65
  def read_pdf(self, temp_file_path: str):
66
- loader = self.pdf_reader.get_loader(temp_file_path)
67
- documents = self.pdf_reader.get_documents(loader)
 
 
 
68
  return documents
69
 
70
  def read_txt(self, temp_file_path: str):
@@ -179,7 +224,6 @@ class ChunkProcessor:
179
  "https://dl4ds.github.io/sp2024/lectures/",
180
  "https://dl4ds.github.io/sp2024/schedule/",
181
  ) # For any additional metadata
182
-
183
  with ThreadPoolExecutor() as executor:
184
  executor.map(
185
  self.process_file,
@@ -245,16 +289,17 @@ class ChunkProcessor:
245
  )
246
  self.document_chunks_full.extend(document_chunks)
247
 
 
248
  self.document_data[file_path] = file_data
249
  self.document_metadata[file_path] = file_metadata
250
 
251
  def process_file(self, file_path, file_index, file_reader, addl_metadata):
252
  file_name = os.path.basename(file_path)
 
253
  if file_name in self.document_data:
254
  return
255
 
256
- file_type = file_name.split(".")[-1].lower()
257
- self.logger.info(f"Reading file {file_index + 1}: {file_path}")
258
 
259
  read_methods = {
260
  "pdf": file_reader.read_pdf,
@@ -269,6 +314,7 @@ class ChunkProcessor:
269
 
270
  try:
271
  documents = read_methods[file_type](file_path)
 
272
  self.process_documents(
273
  documents, file_path, file_type, "file", addl_metadata
274
  )
@@ -330,7 +376,7 @@ class ChunkProcessor:
330
 
331
  class DataLoader:
332
  def __init__(self, config, logger=None):
333
- self.file_reader = FileReader(logger=logger)
334
  self.chunk_processor = ChunkProcessor(config, logger=logger)
335
 
336
  def get_chunks(self, uploaded_files, weblinks):
@@ -348,13 +394,19 @@ if __name__ == "__main__":
348
  with open("../code/modules/config/config.yml", "r") as f:
349
  config = yaml.safe_load(f)
350
 
 
 
 
 
 
351
  data_loader = DataLoader(config, logger=logger)
352
  document_chunks, document_names, documents, document_metadata = (
353
  data_loader.get_chunks(
 
354
  [],
355
- ["https://dl4ds.github.io/sp2024/"],
356
  )
357
  )
358
 
359
- print(document_names)
360
  print(len(document_chunks))
 
 
20
  from langchain import PromptTemplate
21
  import json
22
  from concurrent.futures import ThreadPoolExecutor
23
+ from urllib.parse import urljoin
24
+ import html2text
25
+ import bs4
26
+ import tempfile
27
+ import PyPDF2
28
+ from modules.dataloader.pdf_readers.base import PDFReader
29
+ from modules.dataloader.pdf_readers.llama import LlamaParser
30
+
31
+ try:
32
+ from modules.dataloader.helpers import get_metadata, download_pdf_from_url
33
+ from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
34
+ except:
35
+ from dataloader.helpers import get_metadata, download_pdf_from_url
36
+ from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
37
+
38
+ logger = logging.getLogger(__name__)
39
+ BASE_DIR = os.getcwd()
40
+
41
+
42
+ class HTMLReader:
43
+ def __init__(self):
44
+ pass
45
 
46
+ def read_url(self, url):
47
+ response = requests.get(url)
48
+ if response.status_code == 200:
49
+ return response.text
50
+ else:
51
+ logger.warning(f"Failed to download HTML from URL: {url}")
52
+ return None
53
 
54
+ def check_links(self, base_url, html_content):
55
+ soup = bs4.BeautifulSoup(html_content, "html.parser")
56
+ for link in soup.find_all("a"):
57
+ href = link.get("href")
58
 
59
+ if not href or href.startswith("#"):
60
+ continue
61
+ elif not href.startswith("https"):
62
+ href = href.replace("http", "https")
63
 
64
+ absolute_url = urljoin(base_url, href)
65
+ link['href'] = absolute_url
 
66
 
67
+ resp = requests.head(absolute_url)
68
+ if resp.status_code != 200:
69
+ logger.warning(f"Link {absolute_url} is broken")
70
+ logger.warning(f"Status code: {resp.status_code}")
71
+
72
+ return str(soup)
73
 
74
+ def html_to_md(self, url, html_content):
75
+ html_processed = self.check_links(url, html_content)
76
+ markdown_content = html2text.html2text(html_processed)
77
+ return markdown_content
78
+
79
+ def read_html(self, url):
80
+ html_content = self.read_url(url)
81
+ if html_content:
82
+ return self.html_to_md(url, html_content)
83
+ else:
84
+ return None
85
 
86
  class FileReader:
87
+ def __init__(self, logger, kind):
 
88
  self.logger = logger
89
+ self.kind = kind
90
+ if kind == "llama":
91
+ self.pdf_reader = LlamaParser()
92
+ else:
93
+ self.pdf_reader = PDFReader()
94
+ self.web_reader = HTMLReader()
95
+
96
 
97
  def extract_text_from_pdf(self, pdf_path):
98
  text = ""
 
104
  text += page.extract_text()
105
  return text
106
 
 
 
 
 
 
 
 
 
 
 
 
107
  def read_pdf(self, temp_file_path: str):
108
+ if self.kind == "llama":
109
+ documents = self.pdf_reader.parse(temp_file_path) # asyncio.run(self.pdf_reader.parse(temp_file_path)) if using async
110
+ else:
111
+ loader = self.pdf_reader.get_loader(temp_file_path)
112
+ documents = self.pdf_reader.get_documents(loader)
113
  return documents
114
 
115
  def read_txt(self, temp_file_path: str):
 
224
  "https://dl4ds.github.io/sp2024/lectures/",
225
  "https://dl4ds.github.io/sp2024/schedule/",
226
  ) # For any additional metadata
 
227
  with ThreadPoolExecutor() as executor:
228
  executor.map(
229
  self.process_file,
 
289
  )
290
  self.document_chunks_full.extend(document_chunks)
291
 
292
+ print(f"Processed {file_path}. File_data: {file_data}")
293
  self.document_data[file_path] = file_data
294
  self.document_metadata[file_path] = file_metadata
295
 
296
  def process_file(self, file_path, file_index, file_reader, addl_metadata):
297
  file_name = os.path.basename(file_path)
298
+
299
  if file_name in self.document_data:
300
  return
301
 
302
+ file_type = file_name.split(".")[-1]
 
303
 
304
  read_methods = {
305
  "pdf": file_reader.read_pdf,
 
314
 
315
  try:
316
  documents = read_methods[file_type](file_path)
317
+
318
  self.process_documents(
319
  documents, file_path, file_type, "file", addl_metadata
320
  )
 
376
 
377
  class DataLoader:
378
  def __init__(self, config, logger=None):
379
+ self.file_reader = FileReader(logger=logger, kind=config["llm_params"]["pdf_reader"])
380
  self.chunk_processor = ChunkProcessor(config, logger=logger)
381
 
382
  def get_chunks(self, uploaded_files, weblinks):
 
394
  with open("../code/modules/config/config.yml", "r") as f:
395
  config = yaml.safe_load(f)
396
 
397
+ STORAGE_DIR = os.path.join(BASE_DIR, config['vectorstore']["data_path"])
398
+ uploaded_files = [
399
+ os.path.join(STORAGE_DIR, file) for file in os.listdir(STORAGE_DIR) if file != "urls.txt"
400
+ ]
401
+
402
  data_loader = DataLoader(config, logger=logger)
403
  document_chunks, document_names, documents, document_metadata = (
404
  data_loader.get_chunks(
405
+ ["https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"],
406
  [],
 
407
  )
408
  )
409
 
410
+ print(document_names[:5])
411
  print(len(document_chunks))
412
+
code/modules/dataloader/helpers.py CHANGED
@@ -1,7 +1,7 @@
1
  import requests
2
  from bs4 import BeautifulSoup
3
- from tqdm import tqdm
4
-
5
 
6
  def get_urls_from_file(file_path: str):
7
  """
@@ -106,3 +106,23 @@ def get_metadata(lectures_url, schedule_url):
106
  continue
107
 
108
  return lecture_metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
+ from urllib.parse import urlparse
4
+ import tempfile
5
 
6
  def get_urls_from_file(file_path: str):
7
  """
 
106
  continue
107
 
108
  return lecture_metadata
109
+
110
+
111
+ def download_pdf_from_url(pdf_url):
112
+ """
113
+ Function to temporarily download a PDF file from a URL and return the local file path.
114
+
115
+ Args:
116
+ pdf_url (str): The URL of the PDF file to download.
117
+
118
+ Returns:
119
+ str: The local file path of the downloaded PDF file.
120
+ """
121
+ response = requests.get(pdf_url)
122
+ if response.status_code == 200:
123
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
124
+ temp_file.write(response.content)
125
+ temp_file_path = temp_file.name
126
+ return temp_file_path
127
+ else:
128
+ return None
code/modules/dataloader/pdf_readers/base.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyMuPDFLoader
2
+
3
+
4
+ class PDFReader:
5
+ def __init__(self):
6
+ pass
7
+
8
+ def get_loader(self, pdf_path):
9
+ loader = PyMuPDFLoader(pdf_path)
10
+ return loader
11
+
12
+ def parse(self, pdf_path):
13
+ loader = self.get_loader(pdf_path)
14
+ return loader.load()
code/modules/dataloader/pdf_readers/llama.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from llama_parse import LlamaParse
4
+ from langchain.schema import Document
5
+ from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
6
+ from modules.dataloader.helpers import download_pdf_from_url
7
+
8
+
9
+
10
+ class LlamaParser:
11
+ def __init__(self):
12
+ self.GPT_API_KEY = OPENAI_API_KEY
13
+ self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
14
+ self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
15
+ self.headers = {
16
+ 'Accept': 'application/json',
17
+ 'Authorization': f'Bearer {LLAMA_CLOUD_API_KEY}'
18
+ }
19
+ self.parser = LlamaParse(
20
+ api_key=LLAMA_CLOUD_API_KEY,
21
+ result_type="markdown",
22
+ verbose=True,
23
+ language="en",
24
+ gpt4o_mode=False,
25
+ # gpt4o_api_key=OPENAI_API_KEY,
26
+ parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source."
27
+ )
28
+
29
+ def parse(self, pdf_path):
30
+ if not os.path.exists(pdf_path):
31
+ pdf_path = download_pdf_from_url(pdf_path)
32
+
33
+ documents = self.parser.load_data(pdf_path)
34
+ document = [document.to_langchain_format() for document in documents][0]
35
+
36
+ content = document.page_content
37
+ pages = content.split("\n---\n")
38
+ pages = [page.strip() for page in pages]
39
+
40
+ documents = [
41
+ Document(
42
+ page_content=page,
43
+ metadata={"source": pdf_path, "page": i}
44
+ ) for i, page in enumerate(pages)
45
+ ]
46
+
47
+ return documents
48
+
49
+ def make_request(self, pdf_url):
50
+ payload = {
51
+ "gpt4o_mode": "false",
52
+ "parsing_instruction": "The provided document is a PDF of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source.",
53
+ }
54
+
55
+ files = [
56
+ ('file', ('file', requests.get(pdf_url).content, 'application/octet-stream'))
57
+ ]
58
+
59
+ response = requests.request(
60
+ "POST", self.parse_url, headers=self.headers, data=payload, files=files)
61
+
62
+ return response.json()['id'], response.json()['status']
63
+
64
+ async def get_result(self, job_id):
65
+ url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
66
+
67
+ response = requests.request("GET", url, headers=self.headers, data={})
68
+
69
+ return response.json()['markdown']
70
+
71
+ async def _parse(self, pdf_path):
72
+ job_id, status = self.make_request(pdf_path)
73
+
74
+ while status != "SUCCESS":
75
+ url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}"
76
+ response = requests.request("GET", url, headers=self.headers, data={})
77
+ status = response.json()["status"]
78
+
79
+ result = await self.get_result(job_id)
80
+
81
+ documents = [
82
+ Document(
83
+ page_content=result,
84
+ metadata={"source": pdf_path}
85
+ )
86
+ ]
87
+
88
+ return documents
89
+
90
+ async def _parse(self, pdf_path):
91
+ return await self._parse(pdf_path)
92
+
code/modules/dataloader/webpage_crawler.py CHANGED
@@ -66,7 +66,6 @@ class WebpageCrawler:
66
  )
67
  for link in unchecked_links:
68
  dict_links[link] = "Checked"
69
- print(f"Checked: {link}")
70
  dict_links.update(
71
  {
72
  link: "Not-checked"
 
66
  )
67
  for link in unchecked_links:
68
  dict_links[link] = "Checked"
 
69
  dict_links.update(
70
  {
71
  link: "Not-checked"