XThomasBU committed on
Commit dbc26b1
1 Parent(s): 849b2ae

commit to add lecture pdfs in context

code/modules/data_loader.py CHANGED
@@ -48,6 +48,27 @@ class DataLoader:
         self.splitter = None
         logger.info("InfoLoader instance created")
 
+    def extract_text_from_pdf(self, pdf_path):
+        text = ""
+        with open(pdf_path, "rb") as file:
+            reader = PyPDF2.PdfReader(file)
+            num_pages = len(reader.pages)
+            for page_num in range(num_pages):
+                page = reader.pages[page_num]
+                text += page.extract_text()
+        return text
+
+    def download_pdf_from_url(self, pdf_url):
+        response = requests.get(pdf_url)
+        if response.status_code == 200:
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+                temp_file.write(response.content)
+                temp_file_path = temp_file.name
+            return temp_file_path
+        else:
+            print("Failed to download PDF from URL:", pdf_url)
+            return None
+
     def get_chunks(self, uploaded_files, weblinks):
         # Main list of all documents
         self.document_chunks_full = []
@@ -78,6 +99,13 @@ class DataLoader:
             logger.info(f"\tNumber of pages after skipping: {len(document_chunks)}")
             return document_chunks
 
+        def get_pdf_from_url(pdf_url: str):
+            temp_pdf_path = self.download_pdf_from_url(pdf_url)
+            if temp_pdf_path:
+                title, document_chunks = get_pdf(temp_pdf_path, pdf_url)
+                os.remove(temp_pdf_path)
+            return title, document_chunks
+
         def get_pdf(temp_file_path: str, title: str):
             """
             Function to process PDF files
@@ -201,7 +229,10 @@ class DataLoader:
 
             # Handle different file types
             if file_type == "pdf":
-                title, document_chunks = get_pdf(file_path, file_name)
+                try:
+                    title, document_chunks = get_pdf(file_path, file_name)
+                except:
+                    title, document_chunks = get_pdf_from_url(file_path)
             elif file_type == "txt":
                 title, document_chunks = get_txt(file_path, file_name)
             elif file_type == "docx":
@@ -215,7 +246,7 @@ class DataLoader:
             if self.config["splitter_options"]["remove_chunks"]:
                 document_chunks = remove_chunks(document_chunks)
 
-            logger.info(f"\t\tExtracted no. of chunks: {len(document_chunks)}")
+            logger.info(f"\t\tExtracted no. of chunks: {len(document_chunks)} from {file_name}")
             self.document_names.append(title)
             self.document_chunks_full.extend(document_chunks)
 
@@ -243,6 +274,7 @@ class DataLoader:
                 self.document_chunks_full.extend(document_chunks)
             except:
                 logger.info(f"\t\tError splitting link {link_index+1} : {link}")
+                exit()
 
         logger.info(
             f"\tNumber of document chunks extracted in total: {len(self.document_chunks_full)}\n\n"
code/modules/helpers.py CHANGED
@@ -36,6 +36,10 @@ class WebpageCrawler:
         soup = BeautifulSoup(html_data, "html.parser")
         list_links = []
         for link in soup.find_all("a", href=True):
+
+            # clean the link
+            # remove empty spaces
+            link["href"] = link["href"].strip()
             # Append to list if new link contains original link
             if str(link["href"]).startswith((str(website_link))):
                 list_links.append(link["href"])
@@ -56,14 +60,19 @@
 
     def get_subpage_links(self, l, base_url):
         for link in tqdm(l):
-            # If not crawled through this page start crawling and get links
-            if l[link] == "Not-checked":
-                dict_links_subpages = self.get_links(link, base_url)
-                # Change the dictionary value of the link to "Checked"
+            print('checking link:', link)
+            if not link.endswith("/"):
                 l[link] = "Checked"
-            else:
-                # Create an empty dictionary in case every link is checked
                 dict_links_subpages = {}
+            else:
+                # If not crawled through this page start crawling and get links
+                if l[link] == "Not-checked":
+                    dict_links_subpages = self.get_links(link, base_url)
+                    # Change the dictionary value of the link to "Checked"
+                    l[link] = "Checked"
+                else:
+                    # Create an empty dictionary in case every link is checked
+                    dict_links_subpages = {}
             # Add new dictionary to old dictionary
             l = {**dict_links_subpages, **l}
         return l
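To illustrate the new guard in `get_subpage_links`: links that do not end in `/` (for example direct PDF links) are now marked `"Checked"` without being crawled into, while directory-style links are still expanded. A minimal sketch with made-up URLs and the `self.get_links(...)` call stubbed out:

```python
# Sketch of the new endswith("/") guard; URLs are made up, get_links is stubbed.
links = {
    "https://site.example/course/": "Not-checked",
    "https://site.example/course/lectures/Lecture01.pdf": "Not-checked",
}

for link in list(links):
    if not link.endswith("/"):
        links[link] = "Checked"   # file-like link: do not crawl into it
    elif links[link] == "Not-checked":
        links[link] = "Checked"   # directory link: here the crawler would call get_links()

print(links)  # both end up "Checked"; only the "/" link would have been expanded
```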
code/modules/vector_db.py CHANGED
@@ -60,6 +60,14 @@ class VectorDB:
             urls = all_urls
         return files, urls
 
+    def clean_url_list(self, urls):
+        # get lecture pdf links
+        lecture_pdfs = [link for link in urls if link.endswith(".pdf")]
+        lecture_pdfs = [link for link in lecture_pdfs if "lecture" in link.lower()]
+        urls = [link for link in urls if link.endswith("/")]  # only keep links that end with a '/'. Extract Files Seperately
+
+        return urls, lecture_pdfs
+
     def create_embedding_model(self):
         self.logger.info("Creating embedding function")
         self.embedding_model_loader = EmbeddingModelLoader(self.config)
@@ -79,6 +87,9 @@
         data_loader = DataLoader(self.config)
         self.logger.info("Loading data")
         files, urls = self.load_files()
+        urls, lecture_pdfs = self.clean_url_list(urls)
+        files += lecture_pdfs
+        files.remove('storage/data/urls.txt')
         document_chunks, document_names = data_loader.get_chunks(files, urls)
         self.logger.info("Completed loading data")
 
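A small sketch (made-up URLs, not part of the commit) of what `clean_url_list` does to the crawled link list: PDF links whose name contains "lecture" are routed to the file loader, and only directory-style URLs ending in `/` are kept as web links.

```python
# Sketch of the clean_url_list filtering on a made-up crawl result.
urls = [
    "https://site.example/course/",
    "https://site.example/course/schedule/",
    "https://site.example/course/lectures/Lecture01.pdf",
    "https://site.example/course/hw1.pdf",   # no "lecture" in the name -> dropped
]

lecture_pdfs = [u for u in urls if u.endswith(".pdf") and "lecture" in u.lower()]
urls = [u for u in urls if u.endswith("/")]

print(lecture_pdfs)  # ['https://site.example/course/lectures/Lecture01.pdf']
print(urls)          # ['https://site.example/course/', 'https://site.example/course/schedule/']
```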
 
requirements.txt CHANGED
@@ -16,3 +16,4 @@ beautifulsoup4==4.12.2
 fake-useragent==1.4.0
 git+https://github.com/huggingface/accelerate.git
 llama-cpp-python
+PyPDF2==3.0.1